@@ -46,6 +46,62 @@ pub fn lev_distance(a: &str, b: &str, limit: usize) -> Option<usize> {
4646 ( dcol[ m] <= limit) . then_some ( dcol[ m] )
4747}
4848
49+ /// Provides a word similarity score between two words that accounts for substrings being more
50+ /// meaningful than a typical Levenshtein distance. The lower the score, the closer the match.
51+ /// 0 is an identical match.
52+ ///
53+ /// Uses the Levenshtein distance between the two strings and removes the cost of the length
54+ /// difference. If this is 0 then it is either a substring match or a full word match, in the
55+ /// substring match case we detect this and return `1`. To prevent finding meaningless substrings,
56+ /// eg. "in" in "shrink", we only perform this subtraction of length difference if one of the words
57+ /// is not greater than twice the length of the other. For cases where the words are close in size
58+ /// but not an exact substring then the cost of the length difference is discounted by half.
59+ ///
60+ /// Returns `None` if the distance exceeds the limit.
61+ pub fn lev_distance_with_substrings ( a : & str , b : & str , limit : usize ) -> Option < usize > {
62+ let n = a. chars ( ) . count ( ) ;
63+ let m = b. chars ( ) . count ( ) ;
64+
65+ // Check one isn't less than half the length of the other. If this is true then there is a
66+ // big difference in length.
67+ let big_len_diff = ( n * 2 ) < m || ( m * 2 ) < n;
68+ let len_diff = if n < m { m - n } else { n - m } ;
69+ let lev = lev_distance ( a, b, limit + len_diff) ?;
70+
71+ // This is the crux, subtracting length difference means exact substring matches will now be 0
72+ let score = lev - len_diff;
73+
74+ // If the score is 0 but the words have different lengths then it's a substring match not a full
75+ // word match
76+ let score = if score == 0 && len_diff > 0 && !big_len_diff {
77+ 1 // Exact substring match, but not a total word match so return non-zero
78+ } else if !big_len_diff {
79+ // Not a big difference in length, discount cost of length difference
80+ score + ( len_diff + 1 ) / 2
81+ } else {
82+ // A big difference in length, add back the difference in length to the score
83+ score + len_diff
84+ } ;
85+
86+ ( score <= limit) . then_some ( score)
87+ }
88+
89+ /// Finds the best match for given word in the given iterator where substrings are meaningful.
90+ ///
91+ /// A version of [`find_best_match_for_name`] that uses [`lev_distance_with_substrings`] as the score
92+ /// for word similarity. This takes an optional distance limit which defaults to one-third of the
93+ /// given word.
94+ ///
95+ /// Besides the modified Levenshtein, we use case insensitive comparison to improve accuracy
96+ /// on an edge case with a lower(upper)case letters mismatch.
97+ pub fn find_best_match_for_name_with_substrings (
98+ candidates : & [ Symbol ] ,
99+ lookup : Symbol ,
100+ dist : Option < usize > ,
101+ ) -> Option < Symbol > {
102+ find_best_match_for_name_impl ( true , candidates, lookup, dist)
103+ }
104+
49105/// Finds the best match for a given word in the given iterator.
50106///
51107/// As a loose rule to avoid the obviously incorrect suggestions, it takes
@@ -54,11 +110,20 @@ pub fn lev_distance(a: &str, b: &str, limit: usize) -> Option<usize> {
54110///
55111/// Besides Levenshtein, we use case insensitive comparison to improve accuracy
56112/// on an edge case with a lower(upper)case letters mismatch.
57- #[ cold]
58113pub fn find_best_match_for_name (
59114 candidates : & [ Symbol ] ,
60115 lookup : Symbol ,
61116 dist : Option < usize > ,
117+ ) -> Option < Symbol > {
118+ find_best_match_for_name_impl ( false , candidates, lookup, dist)
119+ }
120+
121+ #[ cold]
122+ fn find_best_match_for_name_impl (
123+ use_substring_score : bool ,
124+ candidates : & [ Symbol ] ,
125+ lookup : Symbol ,
126+ dist : Option < usize > ,
62127) -> Option < Symbol > {
63128 let lookup = lookup. as_str ( ) ;
64129 let lookup_uppercase = lookup. to_uppercase ( ) ;
@@ -74,7 +139,11 @@ pub fn find_best_match_for_name(
74139 let mut dist = dist. unwrap_or_else ( || cmp:: max ( lookup. len ( ) , 3 ) / 3 ) ;
75140 let mut best = None ;
76141 for c in candidates {
77- match lev_distance ( lookup, c. as_str ( ) , dist) {
142+ match if use_substring_score {
143+ lev_distance_with_substrings ( lookup, c. as_str ( ) , dist)
144+ } else {
145+ lev_distance ( lookup, c. as_str ( ) , dist)
146+ } {
78147 Some ( 0 ) => return Some ( * c) ,
79148 Some ( d) => {
80149 dist = d - 1 ;
0 commit comments