001// License: GPL. For details, see LICENSE file. 002package org.openstreetmap.josm.data.validation.tests; 003 004import static java.util.regex.Pattern.CASE_INSENSITIVE; 005import static java.util.regex.Pattern.UNICODE_CASE; 006import static org.openstreetmap.josm.tools.I18n.tr; 007 008import java.awt.geom.Point2D; 009import java.util.ArrayList; 010import java.util.Arrays; 011import java.util.HashMap; 012import java.util.List; 013import java.util.Locale; 014import java.util.Map; 015import java.util.regex.Matcher; 016import java.util.regex.Pattern; 017 018import org.openstreetmap.josm.data.osm.OsmPrimitive; 019import org.openstreetmap.josm.data.osm.Way; 020import org.openstreetmap.josm.data.validation.Severity; 021import org.openstreetmap.josm.data.validation.Test; 022import org.openstreetmap.josm.data.validation.TestError; 023import org.openstreetmap.josm.data.validation.util.ValUtil; 024import org.openstreetmap.josm.gui.progress.ProgressMonitor; 025import org.openstreetmap.josm.tools.MultiMap; 026import org.openstreetmap.josm.tools.Utils; 027 028/** 029 * Checks for similar named ways, symptom of a possible typo. It uses the 030 * Levenshtein distance to check for similarity 031 * 032 * @author frsantos 033 */ 034public class SimilarNamedWays extends Test { 035 036 protected static final int SIMILAR_NAMED = 701; 037 038 /** All ways, grouped by cells */ 039 private Map<Point2D, List<Way>> cellWays; 040 /** The already detected errors */ 041 private MultiMap<Way, Way> errorWays; 042 043 private final List<NormalizeRule> rules = new ArrayList<>(); 044 045 /** 046 * Constructor 047 */ 048 public SimilarNamedWays() { 049 super(tr("Similarly named ways"), 050 tr("This test checks for ways with similar names that may have been misspelled.")); 051 052 // FIXME: hardcode these rules for now. Replace them with preferences later 053 // See https://josm.openstreetmap.de/ticket/3733#comment:19 054 addRegExprRule("\\pN+", "0"); // Unicode numbers: matches "Highway 66" but also persian numbers 055 addRegExprRule("M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "0"); // Roman numbers: matches "Building II" 056 addRegExprRule("\\d+(st|nd|rd|th)", "0st"); // 3rd Ave 057 addRegExprRule("^[A-Z] ", "X"); // E Street 058 addSynonyms("east", "west", "north", "south"); 059 addSynonyms("first", "second", "third"); 060 } 061 062 @Override 063 public void startTest(ProgressMonitor monitor) { 064 super.startTest(monitor); 065 cellWays = new HashMap<>(1000); 066 errorWays = new MultiMap<>(); 067 } 068 069 @Override 070 public void endTest() { 071 cellWays = null; 072 errorWays = null; 073 super.endTest(); 074 } 075 076 @Override 077 public void visit(Way w) { 078 if (!w.isUsable()) 079 return; 080 081 String name = w.get("name"); 082 if (name == null || name.length() < 6) 083 return; 084 085 List<List<Way>> theCellWays = ValUtil.getWaysInCell(w, cellWays); 086 for (List<Way> ways : theCellWays) { 087 for (Way w2 : ways) { 088 if (errorWays.contains(w, w2) || errorWays.contains(w2, w)) { 089 continue; 090 } 091 092 String name2 = w2.get("name"); 093 if (name2 == null || name2.length() < 6) { 094 continue; 095 } 096 097 if (similaryName(name, name2)) { 098 List<OsmPrimitive> primitives = new ArrayList<>(2); 099 primitives.add(w); 100 primitives.add(w2); 101 errors.add(TestError.builder(this, Severity.WARNING, SIMILAR_NAMED) 102 .message(tr("Similarly named ways")) 103 .primitives(primitives) 104 .build()); 105 errorWays.put(w, w2); 106 } 107 } 108 ways.add(w); 109 } 110 } 111 112 /** 113 * Compute Levenshtein distance 114 * 115 * @param s First word 116 * @param t Second word 117 * @return The distance between words 118 */ 119 public static int getLevenshteinDistance(String s, String t) { 120 int[][] d; // matrix 121 int n; // length of s 122 int m; // length of t 123 int i; // iterates through s 124 int j; // iterates through t 125 char si; // ith character of s 126 char tj; // jth character of t 127 int cost; // cost 128 129 // Step 1 130 n = s.length(); 131 m = t.length(); 132 if (n == 0) 133 return m; 134 if (m == 0) 135 return n; 136 d = new int[n+1][m+1]; 137 138 // Step 2 139 for (i = 0; i <= n; i++) { 140 d[i][0] = i; 141 } 142 for (j = 0; j <= m; j++) { 143 d[0][j] = j; 144 } 145 146 // Step 3 147 for (i = 1; i <= n; i++) { 148 149 si = s.charAt(i - 1); 150 151 // Step 4 152 for (j = 1; j <= m; j++) { 153 154 tj = t.charAt(j - 1); 155 156 // Step 5 157 if (si == tj) { 158 cost = 0; 159 } else { 160 cost = 1; 161 } 162 163 // Step 6 164 d[i][j] = Math.min(Math.min(d[i - 1][j] + 1, d[i][j - 1] + 1), d[i - 1][j - 1] + cost); 165 } 166 } 167 168 // Step 7 169 return d[n][m]; 170 } 171 172 /** 173 * Add a regular expression rule. 174 * @param regExpr the regular expression to search for 175 * @param replacement a string to replace with, which should match the expression. 176 */ 177 public void addRegExprRule(String regExpr, String replacement) { 178 rules.add(new RegExprRule(regExpr, replacement)); 179 } 180 181 /** 182 * Add a rule with synonym words. 183 * @param words words which are synonyms 184 */ 185 public void addSynonyms(String... words) { 186 for (String word : words) { 187 rules.add(new SynonymRule(word, words)); 188 } 189 } 190 191 /** 192 * Check if two names are similar, but not identical. First both names will be "normalized". 193 * Afterwards the Levenshtein distance will be calculated.<br> 194 * Examples for normalization rules:<br> 195 * <code>replaceAll("\\d+", "0")</code><br> 196 * would cause similaryName("track 1", "track 2") = false, but similaryName("Track 1", "track 2") = true 197 * @param name first name to compare 198 * @param name2 second name to compare 199 * @return true if the normalized names are different but only a "little bit" 200 */ 201 public boolean similaryName(String name, String name2) { 202 // check plain strings 203 int distance = getLevenshteinDistance(name, name2); 204 boolean similar = distance > 0 && distance <= 2; 205 206 // check if only the case differs, so we don't consider large distance as different strings 207 if (distance > 2 && name.length() == name2.length()) { 208 similar = Utils.deAccent(name).equalsIgnoreCase(Utils.deAccent(name2)); 209 } 210 211 // try all rules 212 for (NormalizeRule rule : rules) { 213 int levenshteinDistance = getLevenshteinDistance(rule.normalize(name), rule.normalize(name2)); 214 if (levenshteinDistance == 0) 215 // one rule results in identical names: identical 216 return false; 217 else if (levenshteinDistance <= 2) { 218 // 0 < distance <= 2 219 similar = true; 220 } 221 } 222 return similar; 223 } 224 225 /** 226 * A normalization that is applied to names before testing them 227 */ 228 @FunctionalInterface 229 public interface NormalizeRule { 230 231 /** 232 * Normalize the string by replacing parts. 233 * @param name name to normalize 234 * @return normalized string 235 */ 236 String normalize(String name); 237 } 238 239 /** 240 * A rule to replace by regular expression, 241 * so that all strings matching the regular expression are handled as if they were {@link RegExprRule#replacement} 242 */ 243 public static class RegExprRule implements NormalizeRule { 244 private final Pattern regExpr; 245 private final String replacement; 246 247 /** 248 * Create a new rule to replace by regular expression 249 * @param expression The regular expression 250 * @param replacement The replacement 251 */ 252 public RegExprRule(String expression, String replacement) { 253 this.regExpr = Pattern.compile(expression); 254 this.replacement = replacement; 255 } 256 257 @Override 258 public String normalize(String name) { 259 return regExpr.matcher(name).replaceAll(replacement); 260 } 261 262 @Override 263 public String toString() { 264 return "replaceAll(" + regExpr + ", " + replacement + ')'; 265 } 266 } 267 268 /** 269 * A rule that registers synonyms to a given word 270 */ 271 public static class SynonymRule implements NormalizeRule { 272 273 private final String[] words; 274 private final Pattern regExpr; 275 private final String replacement; 276 277 /** 278 * Create a new {@link SynonymRule} 279 * @param replacement The word to use instead 280 * @param words The synonyms for that word 281 */ 282 public SynonymRule(String replacement, String... words) { 283 this.replacement = replacement.toLowerCase(Locale.ENGLISH); 284 this.words = words; 285 286 // build regular expression for other words (for fast match) 287 StringBuilder expression = new StringBuilder(); 288 int maxLength = 0; 289 for (int i = 0; i < words.length; i++) { 290 if (words[i].length() > maxLength) { 291 maxLength = words[i].length(); 292 } 293 if (expression.length() > 0) { 294 expression.append('|'); 295 } 296 expression.append(Pattern.quote(words[i])); 297 } 298 this.regExpr = Pattern.compile(expression.toString(), CASE_INSENSITIVE + UNICODE_CASE); 299 } 300 301 @Override 302 public String normalize(String name) { 303 // find first match 304 Matcher matcher = regExpr.matcher(name); 305 if (!matcher.find()) 306 return name; 307 308 int start = matcher.start(); 309 310 // which word matches? 311 String part = ""; 312 for (int i = 0; i < words.length; i++) { 313 String word = words[i]; 314 part = name.substring(start, start + word.length()); 315 if (word.equalsIgnoreCase(part)) { 316 break; 317 } 318 } 319 320 // replace the word 321 char[] newName = matcher.replaceFirst(replacement).toCharArray(); 322 323 // adjust case (replacement is not shorter than matching word!) 324 int minLength = Math.min(replacement.length(), part.length()); 325 for (int i = 0; i < minLength; i++) { 326 if (Character.isUpperCase(part.charAt(i))) { 327 newName[start + i] = Character.toUpperCase(newName[start + i]); 328 } 329 } 330 331 return new String(newName); 332 } 333 334 @Override 335 public String toString() { 336 return "synonyms(" + replacement + ", " + Arrays.toString(words) + ')'; 337 } 338 } 339}