Statement
Given a set of strings (in the general case, of different lengths) consisting of lowercase Latin letters.
Given a string (user request), also consisting of lowercase Latin letters.
Each line contains at most words.
The string also contains at most words.
Words are separated by spaces.
It is necessary to find the "most similar" string to the user's request from the set .
We will take into account that:
The user could swap words.
The user could enter only a part (subsequence) of the string they wanted to find in the set.
The user could have made typos: skipping letters, writing extra letters, replacing letters with others.
Solution
Let's consider the simplest solution.
Let's go through all the permutations of the query words .
For each word permutation , let's iterate over all strings and run the algorithm.
LCS — longest common subsequence.
From all strings we select those strings for which the value is the maximum. Also remember for which permutation this maximum LCS value was obtained. Denote the resulting set of strings as .
Now let's calculate the Levenshtein distance between the corresponding and .
We choose the only row for which the Levenshtein distance from the corresponding — is minimal. This line will be the answer.
Java code
public int F(char x, char y) { if (x == y) return 0; else return 1; } public int GetDist(String a, String b) { int[][] D = new int[100][100]; D[0][0] = 0; for (int i = 1; i < a.length(); i++) { D[i][0] = i; } for (int j = 1; j < b.length(); j++) { D[0][j] = j; } for (int i = 1; i < a.length(); i++) { for (int j = 1; j < b.length(); j++) { int m = F(a.charAt(i), b.charAt(j)); int val1 = D[i][j-1] + 1; int val2 = D[i-1][j] + 1; int val3 = D[i-1][j-1] + m; int minv = Math.min(val1, val2); minv = Math.min(minv, val3); D[i][j] = minv; } } return D[a.length()-1][b.length()-1]; } public String[] swap(String data[], int left, int right) { // Swap the data String temp = data[left]; data[left] = data[right]; data[right] = temp; // Return the updated array return data; } public int Compare(String a, String b) { for (int i = 0; i < Math.min(a.length(), b.length()); i++) { if (a.charAt(i) < b.charAt(i)) { return -1; } if (a.charAt(i) > b.charAt(i)) { return 1; } } if (a.length() < b.length()) return -1; if (a.length() > b.length()) return 1; return 0; } public String[] findNextPermutation(String p[]) { for (int a = p.length - 2; a >= 0; --a) { if (Compare(p[a], p[a+1]) == -1) { for (int b = p.length-1; ; --b) { if (Compare(p[b], p[a]) == 1) { String t = p[a]; p[a] = p[b]; p[b] = t; for (++a, b = p.length-1; a < b; ++a, --b) { t = p[a]; p[a] = p[b]; p[b] = t; } return p; } } } } return p; } public int Factorial(int n) { int fact = 1; for (int i = 2; i <= n; i++) { fact *= i; } return fact; } public int GetLenMaxSubSeq(String a, String b) { int[][] dp = new int[100][100]; dp[0][0] = 0; for (int i = 1; i <= a.length(); i++) { dp[i][0] = 0; } for (int j = 1; j <= b.length(); j++) { dp[0][j] = 0; } for (int i = 1; i <= a.length(); i++) { for (int j = 1; j <= b.length(); j++) { if (a.charAt(i-1) == b.charAt(j-1)) { dp[i][j] = dp[i-1][j-1] + 1; } else dp[i][j] = Math.max(dp[i-1][j], dp[i][j-1]); } } int maxv = 0; for (int i = 0; i <= a.length(); i++) { for (int j = 0; j <= b.length(); j++) { if (dp[i][j] > maxv) { maxv = dp[i][j]; } } } return maxv; } public Pair PermutateSubSeq(String a, String b) { int max_len = 0; String[] b_arr = b.split("\\s+"); Arrays.sort(b_arr); String res = ""; for (int cur_permutation = 0; cur_permutation < Factorial(b_arr.length); cur_permutation++) { int cur_len = GetLenMaxSubSeq(a, b); if (cur_len > max_len) { max_len = cur_len; res = b; } b_arr = findNextPermutation(b_arr); b = ""; for (int i = 0; i < b_arr.length-1; i++) { b += b_arr[i] + " "; } b += b_arr[b_arr.length-1]; } Pair pair = new Pair(); pair.Val = max_len; pair.str = res; return pair; } public int GetIndex(String name, int N) { int index = 0; int maxlensubseq = 0; int min_dist = 10000; for (int i = 0; i < N; i++) { Pair p = PermutateSubSeq(arr[i], name); int cur_subseq_len = p.Val; String str = p.str; if (cur_subseq_len > maxlensubseq) { maxlensubseq = cur_subseq_len; index = i; } if (cur_subseq_len == maxlensubseq) { int cur_dist = GetDist(arr[i], str); if (cur_dist < min_dist) { min_dist = cur_dist; index = i; } } } return index; }