1*6777b538SAndroid Build Coastguard Worker // Copyright 2023 The Chromium Authors
2*6777b538SAndroid Build Coastguard Worker // Use of this source code is governed by a BSD-style license that can be
3*6777b538SAndroid Build Coastguard Worker // found in the LICENSE file.
4*6777b538SAndroid Build Coastguard Worker
5*6777b538SAndroid Build Coastguard Worker #include "base/strings/levenshtein_distance.h"
6*6777b538SAndroid Build Coastguard Worker
7*6777b538SAndroid Build Coastguard Worker #include <stddef.h>
8*6777b538SAndroid Build Coastguard Worker
9*6777b538SAndroid Build Coastguard Worker #include <algorithm>
10*6777b538SAndroid Build Coastguard Worker #include <numeric>
11*6777b538SAndroid Build Coastguard Worker #include <optional>
12*6777b538SAndroid Build Coastguard Worker #include <string_view>
13*6777b538SAndroid Build Coastguard Worker #include <vector>
14*6777b538SAndroid Build Coastguard Worker
15*6777b538SAndroid Build Coastguard Worker namespace base {
16*6777b538SAndroid Build Coastguard Worker
17*6777b538SAndroid Build Coastguard Worker namespace {
18*6777b538SAndroid Build Coastguard Worker
// Computes the Levenshtein (edit) distance between `a` and `b`: the minimum
// number of single-element insertions, removals and replacements needed to
// turn one sequence into the other. Elements are compared with `!=`, so the
// unit of comparison is one `CharT`.
//
// If `max_distance` is provided, the exact distance is returned when it does
// not exceed `max_distance`; otherwise `max_distance + 1` is returned. Without
// `max_distance` the exact distance is always returned.
//
// With `k = min(max_distance, max(a.size(), b.size()))`, the function uses
// O(k) memory and O(min(a.size(), b.size()) * k) time.
template <typename CharT>
size_t LevenshteinDistanceImpl(std::basic_string_view<CharT> a,
                               std::basic_string_view<CharT> b,
                               std::optional<size_t> max_distance) {
  // The distance is symmetric. Making `a` the shorter string keeps the outer
  // loop short and guarantees `a.size() <= b.size()` below.
  if (a.size() > b.size()) {
    a.swap(b);
  }

  // max(a.size(), b.size()) == b.size() steps always suffice, so a larger
  // caller-supplied bound carries no additional information. Clamping it
  // keeps the table size and the per-row work proportional to the input and
  // prevents `k + 1`, `2 * k + 1` and `b.size() + k` from overflowing when a
  // huge `max_distance` (e.g. SIZE_MAX) is passed. The result is unaffected:
  // whenever the clamp applies, the true distance is at most `k`.
  const size_t k = std::min(max_distance.value_or(b.size()), b.size());
  // If the strings' lengths differ by more than `k`, so does their
  // Levenshtein distance. The subtraction cannot underflow because `a` is the
  // shorter string. `k` is unclamped on this path (k < b.size()), so `k + 1`
  // is exactly `max_distance + 1`.
  if (b.size() - a.size() > k) {
    return k + 1;
  }
  // The classical Levenshtein distance DP defines dp[i][j] as the minimum
  // number of insert, remove and replace operations to convert a[:i] to b[:j].
  // To make this more efficient, one can define dp[i][d] as the distance of
  // a[:i] and b[:i + d]. Intuitively, d represents the delta between j and i in
  // the former dp. Since the Levenshtein distance is restricted by `k`, abs(d)
  // can be bounded by `k`. Since dp[i][d] only depends on values from dp[i-1],
  // it is not necessary to store the entire 2D table. Instead, this code just
  // stores the d-dimension, which represents "the distance with the current
  // prefix of the string, for a given delta d". Since d is between `-k` and
  // `k`, the implementation shifts the d-index by `k`, bringing it in range
  // [0, `2*k`].

  // The algorithm only cares if the Levenshtein distance is at most `k`. Thus,
  // any unreachable states and states in which the distance is certainly larger
  // than `k` can be set to any value larger than `k`, without affecting the
  // result.
  const size_t kInfinity = k + 1;
  std::vector<size_t> dp(2 * k + 1, kInfinity);
  // Initially, `dp[d]` represents the Levenshtein distance of the empty prefix
  // of `a` and the first j = d - k characters of `b`. Their distance is j,
  // since j removals are required. States with negative d are not reachable,
  // since that corresponds to a negative index into `b`.
  using Offset = std::vector<size_t>::difference_type;
  std::iota(dp.begin() + static_cast<Offset>(k), dp.end(), size_t{0});
  for (size_t i = 0; i < a.size(); i++) {
    // Right now, `dp` represents the Levenshtein distance when considering the
    // first `i` characters (up to index `i-1`) of `a`. After the next loop,
    // `dp` will represent the Levenshtein distance when considering the first
    // `i+1` characters.
    for (size_t d = 0; d <= 2 * k; d++) {
      if (i + d < k || i + d >= b.size() + k) {
        // `j = i + d - k` is out of range of `b`. Since j == -1 corresponds to
        // the empty prefix of `b`, the distance is i + 1 in this case. Every
        // other out-of-range state is unreachable.
        dp[d] = i + d + 1 == k ? i + 1 : kInfinity;
        continue;
      }
      const size_t j = i + d - k;
      // If `a[i] == b[j]` the Levenshtein distance for `d` remains the same,
      // so `dp[d]` is deliberately left untouched.
      if (a[i] != b[j]) {
        // (i, j) -> (i-1, j-1), `d` stays the same.
        const size_t replace = dp[d];
        // (i, j) -> (i-1, j), `d` increases by 1. `dp[d + 1]` still holds the
        // value of the previous row because `d` is iterated in ascending
        // order.
        // If the distance between `i` and `j` becomes larger than `k`, their
        // distance is at least `k + 1`. Same in the `insert` case.
        const size_t remove = d != 2 * k ? dp[d + 1] : kInfinity;
        // (i, j) -> (i, j-1), `d` decreases by 1. Since `i` stays the same,
        // this is intentionally using the dp value updated in the previous
        // iteration.
        const size_t insert = d != 0 ? dp[d - 1] : kInfinity;
        dp[d] = 1 + std::min({replace, remove, insert});
      }
    }
  }
  // The answer lives at delta `b.size() - a.size()` (shifted by `k`). Entries
  // may have grown past `k + 1`, so cap the result to honor the documented
  // "bound + 1" return value.
  return std::min(dp[b.size() + k - a.size()], k + 1);
}
88*6777b538SAndroid Build Coastguard Worker
89*6777b538SAndroid Build Coastguard Worker } // namespace
90*6777b538SAndroid Build Coastguard Worker
LevenshteinDistance(std::string_view a,std::string_view b,std::optional<size_t> max_distance)91*6777b538SAndroid Build Coastguard Worker size_t LevenshteinDistance(std::string_view a,
92*6777b538SAndroid Build Coastguard Worker std::string_view b,
93*6777b538SAndroid Build Coastguard Worker std::optional<size_t> max_distance) {
94*6777b538SAndroid Build Coastguard Worker return LevenshteinDistanceImpl(a, b, max_distance);
95*6777b538SAndroid Build Coastguard Worker }
LevenshteinDistance(std::u16string_view a,std::u16string_view b,std::optional<size_t> max_distance)96*6777b538SAndroid Build Coastguard Worker size_t LevenshteinDistance(std::u16string_view a,
97*6777b538SAndroid Build Coastguard Worker std::u16string_view b,
98*6777b538SAndroid Build Coastguard Worker std::optional<size_t> max_distance) {
99*6777b538SAndroid Build Coastguard Worker return LevenshteinDistanceImpl(a, b, max_distance);
100*6777b538SAndroid Build Coastguard Worker }
101*6777b538SAndroid Build Coastguard Worker
102*6777b538SAndroid Build Coastguard Worker } // namespace base
103