Upgrade validations outdated_tags, suspicious_names to NSI v5

This commit is contained in:
Bryan Housel
2021-01-13 13:46:00 -05:00
parent 2d8c90786f
commit d3fb8c60f9
8 changed files with 210 additions and 109 deletions
+4 -4
View File
@@ -1767,9 +1767,9 @@ en:
message: "{feature} has incomplete tags"
reference: "Some features should have additional tags."
noncanonical_brand:
message: "{feature} looks like a brand with nonstandard tags"
message_incomplete: "{feature} looks like a brand with incomplete tags"
reference: "All features of the same brand should be tagged the same way."
message: "{feature} looks like a common feature with nonstandard tags"
message_incomplete: "{feature} looks like a common feature with incomplete tags"
reference: "Some features, for example retail chains or post offices, are expected to have certain tags in common."
point_as_area:
message: '{feature} should be a point, not an area'
point_as_line:
@@ -2332,4 +2332,4 @@ en:
wikidata:
identifier: "Identifier"
label: "Label"
description: "Description"
description: "Description"
+1 -1
View File
File diff suppressed because one or more lines are too long
-5
View File
@@ -20,11 +20,6 @@ export function coreFileFetcher() {
'languages': 'data/languages.min.json',
'locales': 'locales/index.min.json',
// old
'nsi_brands': 'https://cdn.jsdelivr.net/npm/name-suggestion-index@4/dist/brands.min.json',
'nsi_filters': 'https://cdn.jsdelivr.net/npm/name-suggestion-index@4/dist/filters.min.json',
// new
'nsi_presets': 'https://raw.githubusercontent.com/osmlab/name-suggestion-index/main/dist/presets/nsi-id-presets.min.json',
'nsi_data': 'https://raw.githubusercontent.com/osmlab/name-suggestion-index/main/dist/nsi.min.json',
'nsi_features': 'https://raw.githubusercontent.com/osmlab/name-suggestion-index/main/dist/featureCollection.min.json',
+191 -89
View File
@@ -1,15 +1,13 @@
import { t } from '../core/localizer';
import * as countryCoder from '@ideditor/country-coder';
import LocationConflation from '@ideditor/location-conflation';
import { matcher as Matcher } from 'name-suggestion-index';
import { presetManager } from '../presets';
import { fileFetcher } from '../core/file_fetcher';
import { fileFetcher, locationManager } from '../core';
import { actionChangePreset } from '../actions/change_preset';
import { actionChangeTags } from '../actions/change_tags';
import { actionUpgradeTags } from '../actions/upgrade_tags';
import { presetManager } from '../presets';
import { osmIsOldMultipolygonOuterMember, osmOldMultipolygonOuterMemberOfRelation } from '../osm/multipolygon';
import { utilDisplayLabel, utilTagDiff } from '../util';
import { utilArrayUniq, utilDisplayLabel, utilTagDiff } from '../util';
import { validationIssue, validationIssueFix } from '../core/validation';
@@ -18,7 +16,6 @@ let _nsi;
export function validationOutdatedTags() {
const type = 'outdated_tags';
const nsiKeys = ['amenity', 'shop', 'tourism', 'leisure', 'office'];
// A concern here in switching to async data means that `_dataDeprecated`
// and `_nsi` will not be available at first, so the data on early tiles
@@ -30,49 +27,98 @@ export function validationOutdatedTags() {
.catch(() => { /* ignore */ });
// console.log('NSI: start fetching..');
// // fetch the name-suggestion-index data
// Promise.all([
// fileFetcher.get('nsi_data'),
// fileFetcher.get('nsi_features'),
// fileFetcher.get('nsi_generics'),
// fileFetcher.get('nsi_replacements'),
// fileFetcher.get('nsi_trees')
// ])
// .then(vals => {
// _nsi = {
// data: vals[0].nsi,
// features: vals[1],
// generics: vals[2].genericWords,
// replacements: vals[3].replacements,
// trees: vals[4].trees
// };
// Returns a Promise that fulfills (with no value) once `msec` milliseconds have passed.
function delay(msec) {
    const startTimer = (done) => { window.setTimeout(done, msec); };
    return new Promise(startTimer);
}
// console.log('NSI: done fetching..');
// console.log('NSI: start indexing..');
// This Promise will fulfill after NSI presets are loaded and locations merged into the locationManager.
// Resolves once both the NSI presets and NSI features files have been fetched,
// a 1 second pause has elapsed, and `locationManager.mergeLocationSets([])` has settled.
function waitForNSIPresets() {
    const fetches = ['nsi_presets', 'nsi_features'].map(name => fileFetcher.get(name));

    return Promise.all(fetches)
        // wait 1 sec for locationSets to enter the locationManager queue
        .then(() => delay(1000))
        .then(() => locationManager.mergeLocationSets([]));
}
// _nsi.loco = new LocationConflation(_nsi.features);
// _nsi.matcher = Matcher();
// _nsi.matcher.buildMatchIndex(_nsi.data);
// _nsi.matcher.buildLocationIndex(_nsi.data, _nsi.loco);
// Fetch the name-suggestion-index data, then build the lookup structures cached in `_nsi`.
// The fetches start only after `waitForNSIPresets()` has fulfilled.
waitForNSIPresets()
.then(() => Promise.all([
fileFetcher.get('nsi_data'),
fileFetcher.get('nsi_replacements'),
fileFetcher.get('nsi_trees')
]))
.then(vals => {
// Already built by an earlier run - reuse it
if (_nsi) return _nsi;
// console.log('NSI: done indexing..');
// NOTE(review): `_nsi` is assigned here, before the indexing below has finished.
// If anything below throws, the trailing `.catch` ignores the error and `_nsi`
// stays set but only partially populated - confirm whether that is acceptable.
_nsi = {
data: vals[0].nsi, // the raw name-suggestion-index data
replacements: vals[1].replacements, // trivial old->new qid replacements
trees: vals[2].trees, // metadata about trees, main tags
keys: new Set(), // primary osm keys to check for a NSI match
qids: new Map(), // Map wd/wp tag values -> qids
ids: new Map() // Map id -> NSI item
};
// // initialize name-suggestion-index matcher
// // _nsi.matcher.buildMatchIndex(d.brands);
// The location index is built against the locationManager's `loco()` instance
_nsi.matcher = Matcher();
_nsi.matcher.buildMatchIndex(_nsi.data);
_nsi.matcher.buildLocationIndex(_nsi.data, locationManager.loco());
// // index all known wikipedia and wikidata tags
// // Object.keys(d.brands).forEach(kvnd => {
// // const brand = d.brands[kvnd];
// // const wd = brand.tags['brand:wikidata'];
// // const wp = brand.tags['brand:wikipedia'];
// // if (wd) { _nsi.wikidata[wd] = kvnd; }
// // if (wp) { _nsi.wikipedia[wp] = kvnd; }
// // });
Object.keys(_nsi.data).forEach(tkv => {
const parts = tkv.split('/', 3); // tkv = "tree/key/value"
// NOTE(review): this local `t` shadows the `t` imported from '../core/localizer' inside this callback
const t = parts[0];
const k = parts[1];
// return _nsi;
// })
// .catch(() => { /* ignore */ });
// Collect primary keys (e.g. "amenity", "craft", "shop", "man_made", "route", etc)
_nsi.keys.add(k);
// NOTE(review): if `_nsi.trees[t]` is undefined, reading `tree.mainTag` throws (and the `.catch` below ignores it)
const tree = _nsi.trees[t]; // e.g. "brands", "operators"
const mainTag = tree.mainTag; // e.g. "brand:wikidata", "operator:wikidata", etc
const items = _nsi.data[tkv] || [];
items.forEach(item => {
// Cache NSI ids and main tags
item.mainTag = mainTag;
_nsi.ids.set(item.id, item);
// Cache Wikidata/Wikipedia values, for #6416
// Both the wikidata value and the wikipedia value map to the wikidata QID
const wd = item.tags[mainTag];
const wp = item.tags[mainTag.replace('wikidata', 'wikipedia')];
if (wd) _nsi.qids.set(wd, wd);
if (wp && wd) _nsi.qids.set(wp, wd);
});
});
_nsi.keys.add('building'); // fallback can match building=* for some categories
return _nsi;
})
.catch(() => { /* ignore */ });
// Returns true if this tag key is a "namelike" tag that the NSI matcher would have indexed..
// Tests whether tag key `k` is a "namelike" key, i.e. one that the NSI matcher would have indexed.
function isNamelike(k) {
    // A suffix is usually a language code (`name:en`, `name:ru`), but some suffixes
    // mean something else entirely (`operator:type`, `name:etymology`, ..) - never namelike.
    const excludedSuffix = /:(colour|type|left|right|etymology|wikipedia)$/;
    if (excludedSuffix.test(k)) return false;

    const nameKeyShapes = [
        /^(flag:)?name$/i,                                             // `name`, `flag:name`
        /^(brand|country|flag|operator|network|subject)$/i,            // bare namelike keys
        /^\w+_name$/i,                                                 // `alt_name`, `short_name`
        /^(name|brand|country|flag|operator|network|subject):\w+$/i,   // `name:en`, `name:ru`
        /^\w+_name:\w+$/i                                              // `alt_name:en`, `short_name:ru`
    ];

    return nameKeyShapes.some(shape => shape.test(k));
}
function oldTagIssues(entity, graph) {
@@ -81,7 +127,7 @@ export function validationOutdatedTags() {
let subtype = 'deprecated_tags';
if (!preset) return [];
// upgrade preset..
// Upgrade preset, if a replacement is available..
if (preset.replacement) {
const newPreset = presetManager.item(preset.replacement);
graph = actionChangePreset(entity.id, preset, newPreset, true /* skip field defaults */)(graph);
@@ -89,7 +135,7 @@ export function validationOutdatedTags() {
preset = newPreset;
}
// upgrade tags..
// Upgrade deprecated tags..
if (_dataDeprecated) {
const deprecatedTags = entity.deprecatedTags(_dataDeprecated);
if (deprecatedTags.length) {
@@ -100,7 +146,7 @@ export function validationOutdatedTags() {
}
}
// add missing addTags..
// Add missing addTags from the detected preset
let newTags = Object.assign({}, entity.tags); // shallow copy
if (preset.tags !== preset.addTags) {
Object.keys(preset.addTags).forEach(k => {
@@ -114,63 +160,119 @@ export function validationOutdatedTags() {
});
}
// Attempt to match a canonical record in the name-suggestion-index.
// This index contains the most correct tagging for many commonly mapped features.
// See https://github.com/osmlab/name-suggestion-index and https://nsi.guide
if (_nsi) {
// Do `wikidata` or `wikipedia` identify this entity as a brand? #6416
// If so, these tags can be swapped to `brand:wikidata`/`brand:wikipedia`
let isBrand;
if (newTags.wikidata) { // try matching `wikidata`
isBrand = _nsi.wikidata[newTags.wikidata];
}
if (!isBrand && newTags.wikipedia) { // fallback to `wikipedia`
isBrand = _nsi.wikipedia[newTags.wikipedia];
}
if (isBrand && !newTags.office) { // but avoid doing this for corporate offices
if (newTags.wikidata) {
newTags['brand:wikidata'] = newTags.wikidata;
delete newTags.wikidata;
// Perform trivial Wikipedia/Wikidata replacements
Object.keys(newTags).forEach(osmkey => {
const matchTag = osmkey.match(/^(\w+:)?wikidata$/);
if (matchTag) { // Look at '*:wikidata' tags
const prefix = (matchTag[1] || '');
const wd = newTags[osmkey];
const replace = _nsi.replacements[wd]; // If it matches a QID in the replacement list...
if (replace && replace.wikidata !== undefined) { // replace or delete `*:wikidata` tag
if (replace.wikidata) {
newTags[osmkey] = replace.wikidata;
} else {
delete newTags[osmkey];
}
}
if (replace && replace.wikipedia !== undefined) { // replace or delete `*:wikipedia` tag
const wpkey = `${prefix}wikipedia`;
if (replace.wikipedia) {
newTags[wpkey] = replace.wikipedia;
} else {
delete newTags[wpkey];
}
}
}
if (newTags.wikipedia) {
newTags['brand:wikipedia'] = newTags.wikipedia;
delete newTags.wikipedia;
});
// Do `wikidata` or `wikipedia` tags identify this entity as a chain? #6416
// If so, these tags can be swapped to e.g. `brand:wikidata`/`brand:wikipedia` below.
let foundQID = _nsi.qids.get(newTags.wikidata) || _nsi.qids.get(newTags.wikipedia);
// We will only spend time to compute these things if it's necessary
let names, loc, match;
// Try each primary key ("amenity", "craft", "shop", "man_made", "route", etc)
const nsiKeys = Array.from(_nsi.keys);
for (let i = 0; i < nsiKeys.length; i++) {
if (match) break; // matched already, stop looking
let k = nsiKeys[i];
let v = newTags[k];
if (!v) continue;
// Only attempt a match on building/yes if there is nothing else remarkable about that building.
if (k === 'building') {
v = 'yes';
if (preset.id !== 'building/yes') continue; // the feature matched a better preset
}
// I considered setting `name` and other tags here, but they aren't unique per wikidata
// (Q2759586 -> in USA "Papa John's", in Russia "Папа Джонс")
// So users will really need to use a preset or assign `name` themselves.
}
// try key/value|name match against name-suggestion-index
if (newTags.name) {
for (let i = 0; i < nsiKeys.length; i++) {
const k = nsiKeys[i];
if (!newTags[k]) continue;
if (!loc) { // collect location for this feature only once
loc = entity.extent(graph).center();
}
if (!names) { // collect names for this feature only once
names = Object.keys(newTags)
.map(k => isNamelike(k) ? newTags[k] : null)
.filter(Boolean);
const center = entity.extent(graph).center();
const countryCode = countryCoder.iso1A2Code(center);
const match = _nsi.matcher.matchKVN(k, newTags[k], newTags.name, countryCode && countryCode.toLowerCase());
if (!match) continue;
if (foundQID) names.unshift(foundQID); // matcher will recognize the QID as a name too
names = utilArrayUniq(names);
}
// for now skip ambiguous matches (like Target~(USA) vs Target~(Australia))
if (match.d) continue;
// Try each namelike value
for (let n = 0; n < names.length; n++) {
match = _nsi.matcher.match(k, v, names[n], loc); // Attempt to match an item in NSI
if (!match) continue; // keep looking
const brand = _nsi.brands[match.kvnd];
if (brand && brand.tags['brand:wikidata'] &&
brand.tags['brand:wikidata'] !== entity.tags['not:brand:wikidata']) {
subtype = 'noncanonical_brand';
// If we get here, there was a match..
// A match may contain multiple results, the first one is the best one for this location
// e.g. `['pfk-a54c14', 'kfc-1ff19c', 'kfc-658eea']`
const item = _nsi.ids.get(match[0]);
const mainTag = item.mainTag; // e.g. `brand:wikidata`
const itemQID = item.tags[mainTag]; // e.g. `brand:wikidata` qid
const notQID = newTags[`not:${mainTag}`]; // e.g. `not:brand:wikidata` qid
const keepTags = ['takeaway'].reduce((acc, k) => {
if (newTags[k]) {
acc[k] = newTags[k];
}
// Exceptions, throw out the match
if (
(!itemQID || itemQID === notQID) || // no `*:wikidata` or matched a `not:*:wikidata`
(newTags.office && !item.tags.office) // feature may be a coprorate office for a brand? - #6416
) {
match = null; // forget match and keep looking
continue; // (it might make sense to stop looking, not sure)
}
// We are keeping the match at this point
subtype = 'noncanonical_brand';
// Preserve some tags values that we don't want NSI to overwrite.
const keepTags = ['takeaway', 'building']
.reduce((acc, k) => {
if (newTags[k]) acc[k] = newTags[k];
return acc;
}, {});
nsiKeys.forEach(k => delete newTags[k]);
Object.assign(newTags, brand.tags, keepTags);
break;
// Replace the primary tags with what's in NSI ("amenity", "craft", "shop", "man_made", "route", etc)
nsiKeys.forEach(k => delete newTags[k]);
// Replace `wikidata`/`wikipedia` with e.g. `brand:wikidata`/`brand:wikipedia`
if (foundQID) {
delete newTags.wikipedia;
delete newTags.wikidata;
}
Object.assign(newTags, item.tags, keepTags);
break; // stop looking
}
}
}
// maybe someday: match features without the location to determine
// if a feature appears somewhere in the world that it shouldn't.
} // end if _nsi
// determine diff
const tagDiff = utilTagDiff(oldTags, newTags);
+3 -4
View File
@@ -17,11 +17,10 @@ export function validationSuspiciousName() {
// One concern with switching to async data is that `_nsiFilters` will not
// be available at first, so the data on early tiles may not have tags validated fully.
fileFetcher.get('nsi_filters')
.then(filters => {
fileFetcher.get('nsi_generics')
.then(data => {
// known list of generic names (e.g. "bar")
_discardNameRegexes = filters.discardNames
.map(discardName => new RegExp(discardName, 'i'));
_discardNameRegexes = data.genericWords.map(pattern => new RegExp(pattern, 'i'));
})
.catch(() => { /* ignore */ });
+1 -2
View File
@@ -1,5 +1,5 @@
describe('iD.coreLocations', function() {
var locationManager, loco, wp;
var locationManager, loco;
var colorado = {
type: 'Feature',
@@ -27,7 +27,6 @@ describe('iD.coreLocations', function() {
// make a new one each time, so we aren't accidentally testing the "global" locationManager
locationManager = iD.coreLocations();
loco = locationManager.loco();
wp = locationManager.wp();
});
+8 -2
View File
@@ -17,10 +17,16 @@ iD.fileFetcher.cache().preset_categories = {};
iD.fileFetcher.cache().preset_defaults = {};
iD.fileFetcher.cache().preset_fields = {};
iD.fileFetcher.cache().preset_presets = {};
// Initializing `coreContext` initializes `_validator`, which tries loading:
iD.fileFetcher.cache().deprecated = [];
iD.fileFetcher.cache().nsi_brands = [];
iD.fileFetcher.cache().nsi_filters = { discardNames: [] };
iD.fileFetcher.cache().nsi_presets = { presets: {} };
iD.fileFetcher.cache().nsi_data = { nsi: {} };
iD.fileFetcher.cache().nsi_features = { type: 'FeatureCollection', features: [] };
iD.fileFetcher.cache().nsi_generics = { genericWords: [] };
iD.fileFetcher.cache().nsi_replacements = { replacements: {} };
iD.fileFetcher.cache().nsi_trees = { trees: {} };
// Initializing `coreContext` initializes `_uploader`, which tries loading:
iD.fileFetcher.cache().discarded = {};
+2 -2
View File
@@ -2,11 +2,11 @@ describe('iD.validations.suspicious_name', function () {
var context;
before(function() {
iD.fileFetcher.cache().nsi_filters = { discardNames: ['^stores?$'] };
iD.fileFetcher.cache().nsi_generics = { genericWords: ['^stores?$'] };
});
after(function() {
iD.fileFetcher.cache().nsi_filters = { discardNames: [] };
iD.fileFetcher.cache().nsi_generics = { genericWords: [] };
});
beforeEach(function() {