001// License: GPL. For details, see LICENSE file.
002package org.openstreetmap.josm.data.validation.tests;
003
004import static java.util.regex.Pattern.CASE_INSENSITIVE;
005import static java.util.regex.Pattern.UNICODE_CASE;
006import static org.openstreetmap.josm.tools.I18n.tr;
007
008import java.awt.geom.Point2D;
009import java.util.ArrayList;
010import java.util.Arrays;
011import java.util.HashMap;
012import java.util.List;
013import java.util.Locale;
014import java.util.Map;
015import java.util.regex.Matcher;
016import java.util.regex.Pattern;
017
018import org.openstreetmap.josm.data.osm.OsmPrimitive;
019import org.openstreetmap.josm.data.osm.Way;
020import org.openstreetmap.josm.data.validation.Severity;
021import org.openstreetmap.josm.data.validation.Test;
022import org.openstreetmap.josm.data.validation.TestError;
023import org.openstreetmap.josm.data.validation.util.ValUtil;
024import org.openstreetmap.josm.gui.progress.ProgressMonitor;
025import org.openstreetmap.josm.tools.MultiMap;
026import org.openstreetmap.josm.tools.Utils;
027
028/**
029 * Checks for similar named ways, symptom of a possible typo. It uses the
030 * Levenshtein distance to check for similarity
031 *
032 * @author frsantos
033 */
034public class SimilarNamedWays extends Test {
035
036    protected static final int SIMILAR_NAMED = 701;
037
038    /** All ways, grouped by cells */
039    private Map<Point2D, List<Way>> cellWays;
040    /** The already detected errors */
041    private MultiMap<Way, Way> errorWays;
042
043    private final List<NormalizeRule> rules = new ArrayList<>();
044
045    /**
046     * Constructor
047     */
048    public SimilarNamedWays() {
049        super(tr("Similarly named ways"),
050                tr("This test checks for ways with similar names that may have been misspelled."));
051
052        // FIXME: hardcode these rules for now. Replace them with preferences later
053        // See https://josm.openstreetmap.de/ticket/3733#comment:19
054        addRegExprRule("\\pN+", "0"); // Unicode numbers: matches "Highway 66" but also persian numbers
055        addRegExprRule("M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "0"); // Roman numbers: matches "Building II"
056        addRegExprRule("\\d+(st|nd|rd|th)", "0st"); // 3rd Ave
057        addRegExprRule("^[A-Z] ", "X"); // E Street
058        addSynonyms("east", "west", "north", "south");
059        addSynonyms("first", "second", "third");
060    }
061
062    @Override
063    public void startTest(ProgressMonitor monitor) {
064        super.startTest(monitor);
065        cellWays = new HashMap<>(1000);
066        errorWays = new MultiMap<>();
067    }
068
069    @Override
070    public void endTest() {
071        cellWays = null;
072        errorWays = null;
073        super.endTest();
074    }
075
076    @Override
077    public void visit(Way w) {
078        if (!w.isUsable())
079            return;
080
081        String name = w.get("name");
082        if (name == null || name.length() < 6)
083            return;
084
085        List<List<Way>> theCellWays = ValUtil.getWaysInCell(w, cellWays);
086        for (List<Way> ways : theCellWays) {
087            for (Way w2 : ways) {
088                if (errorWays.contains(w, w2) || errorWays.contains(w2, w)) {
089                    continue;
090                }
091
092                String name2 = w2.get("name");
093                if (name2 == null || name2.length() < 6) {
094                    continue;
095                }
096
097                if (similaryName(name, name2)) {
098                    List<OsmPrimitive> primitives = new ArrayList<>(2);
099                    primitives.add(w);
100                    primitives.add(w2);
101                    errors.add(TestError.builder(this, Severity.WARNING, SIMILAR_NAMED)
102                            .message(tr("Similarly named ways"))
103                            .primitives(primitives)
104                            .build());
105                    errorWays.put(w, w2);
106                }
107            }
108            ways.add(w);
109        }
110    }
111
112    /**
113     * Compute Levenshtein distance
114     *
115     * @param s First word
116     * @param t Second word
117     * @return The distance between words
118     */
119    public static int getLevenshteinDistance(String s, String t) {
120        int[][] d; // matrix
121        int n; // length of s
122        int m; // length of t
123        int i; // iterates through s
124        int j; // iterates through t
125        char si; // ith character of s
126        char tj; // jth character of t
127        int cost; // cost
128
129        // Step 1
130        n = s.length();
131        m = t.length();
132        if (n == 0)
133            return m;
134        if (m == 0)
135            return n;
136        d = new int[n+1][m+1];
137
138        // Step 2
139        for (i = 0; i <= n; i++) {
140            d[i][0] = i;
141        }
142        for (j = 0; j <= m; j++) {
143            d[0][j] = j;
144        }
145
146        // Step 3
147        for (i = 1; i <= n; i++) {
148
149            si = s.charAt(i - 1);
150
151            // Step 4
152            for (j = 1; j <= m; j++) {
153
154                tj = t.charAt(j - 1);
155
156                // Step 5
157                if (si == tj) {
158                    cost = 0;
159                } else {
160                    cost = 1;
161                }
162
163                // Step 6
164                d[i][j] = Math.min(Math.min(d[i - 1][j] + 1, d[i][j - 1] + 1), d[i - 1][j - 1] + cost);
165            }
166        }
167
168        // Step 7
169        return d[n][m];
170    }
171
172    /**
173     * Add a regular expression rule.
174     * @param regExpr the regular expression to search for
175     * @param replacement a string to replace with, which should match the expression.
176     */
177    public void addRegExprRule(String regExpr, String replacement) {
178        rules.add(new RegExprRule(regExpr, replacement));
179    }
180
181    /**
182     * Add a rule with synonym words.
183     * @param words words which are synonyms
184     */
185    public void addSynonyms(String... words) {
186        for (String word : words) {
187            rules.add(new SynonymRule(word, words));
188        }
189    }
190
191    /**
192     * Check if two names are similar, but not identical. First both names will be "normalized".
193     * Afterwards the Levenshtein distance will be calculated.<br>
194     * Examples for normalization rules:<br>
195     * <code>replaceAll("\\d+", "0")</code><br>
196     * would cause similaryName("track 1", "track 2") = false, but similaryName("Track 1", "track 2") = true
197     * @param name first name to compare
198     * @param name2 second name to compare
199     * @return true if the normalized names are different but only a "little bit"
200     */
201    public boolean similaryName(String name, String name2) {
202        // check plain strings
203        int distance = getLevenshteinDistance(name, name2);
204        boolean similar = distance > 0 && distance <= 2;
205
206        // check if only the case differs, so we don't consider large distance as different strings
207        if (distance > 2 && name.length() == name2.length()) {
208            similar = Utils.deAccent(name).equalsIgnoreCase(Utils.deAccent(name2));
209        }
210
211        // try all rules
212        for (NormalizeRule rule : rules) {
213            int levenshteinDistance = getLevenshteinDistance(rule.normalize(name), rule.normalize(name2));
214            if (levenshteinDistance == 0)
215                // one rule results in identical names: identical
216                return false;
217            else if (levenshteinDistance <= 2) {
218                // 0 < distance <= 2
219                similar = true;
220            }
221        }
222        return similar;
223    }
224
225    /**
226     * A normalization that is applied to names before testing them
227     */
228    @FunctionalInterface
229    public interface NormalizeRule {
230
231        /**
232         * Normalize the string by replacing parts.
233         * @param name name to normalize
234         * @return normalized string
235         */
236        String normalize(String name);
237    }
238
239    /**
240     * A rule to replace by regular expression,
241     * so that all strings matching the regular expression are handled as if they were {@link RegExprRule#replacement}
242     */
243    public static class RegExprRule implements NormalizeRule {
244        private final Pattern regExpr;
245        private final String replacement;
246
247        /**
248         * Create a new rule to replace by regular expression
249         * @param expression The regular expression
250         * @param replacement The replacement
251         */
252        public RegExprRule(String expression, String replacement) {
253            this.regExpr = Pattern.compile(expression);
254            this.replacement = replacement;
255        }
256
257        @Override
258        public String normalize(String name) {
259            return regExpr.matcher(name).replaceAll(replacement);
260        }
261
262        @Override
263        public String toString() {
264            return "replaceAll(" + regExpr + ", " + replacement + ')';
265        }
266    }
267
268    /**
269     * A rule that registers synonyms to a given word
270     */
271    public static class SynonymRule implements NormalizeRule {
272
273        private final String[] words;
274        private final Pattern regExpr;
275        private final String replacement;
276
277        /**
278         * Create a new {@link SynonymRule}
279         * @param replacement The word to use instead
280         * @param words The synonyms for that word
281         */
282        public SynonymRule(String replacement, String... words) {
283            this.replacement = replacement.toLowerCase(Locale.ENGLISH);
284            this.words = words;
285
286            // build regular expression for other words (for fast match)
287            StringBuilder expression = new StringBuilder();
288            int maxLength = 0;
289            for (int i = 0; i < words.length; i++) {
290                if (words[i].length() > maxLength) {
291                    maxLength = words[i].length();
292                }
293                if (expression.length() > 0) {
294                    expression.append('|');
295                }
296                expression.append(Pattern.quote(words[i]));
297            }
298            this.regExpr = Pattern.compile(expression.toString(), CASE_INSENSITIVE + UNICODE_CASE);
299        }
300
301        @Override
302        public String normalize(String name) {
303            // find first match
304            Matcher matcher = regExpr.matcher(name);
305            if (!matcher.find())
306                return name;
307
308            int start = matcher.start();
309
310            // which word matches?
311            String part = "";
312            for (int i = 0; i < words.length; i++) {
313                String word = words[i];
314                part = name.substring(start, start + word.length());
315                if (word.equalsIgnoreCase(part)) {
316                    break;
317                }
318            }
319
320            // replace the word
321            char[] newName = matcher.replaceFirst(replacement).toCharArray();
322
323            // adjust case (replacement is not shorter than matching word!)
324            int minLength = Math.min(replacement.length(), part.length());
325            for (int i = 0; i < minLength; i++) {
326                if (Character.isUpperCase(part.charAt(i))) {
327                    newName[start + i] = Character.toUpperCase(newName[start + i]);
328                }
329            }
330
331            return new String(newName);
332        }
333
334        @Override
335        public String toString() {
336            return "synonyms(" + replacement + ", " + Arrays.toString(words) + ')';
337        }
338    }
339}