WME PLN Core - Normalization Engine

Motor de lógica de normalización para WME Place Normalizer. No funciona por sí solo.
Tento skript by nemal byť nainštalovaný priamo. Je to knižnica pre ďalšie skripty, ktorú by mali používať cez meta príkaz // @require https://update.greasyforks.org/scripts/548747/1657858/WME%20PLN%20Core%20-%20Normalization%20Engine.js
Spýtať sa otázku, uverejniť recenziu, alebo nahlásiť skript.
Zalomiť riadky
// ==UserScript==
// @name         WME PLN Core - Normalization Engine
// @namespace    https://greasyforks.org/en/users/mincho77
// @version      9.0.0
// @description  Motor de lógica de normalización para WME Place Normalizer. No funciona por sí solo.
// @author       mincho77
// @license      MIT
// @grant        none

// ==/UserScript==

// Helper local (usa PLNCore.utils.escapeRegExp si existe)
function plnEscapeRegExpLocal(s){
    const f = (typeof PLNCore !== 'undefined' && PLNCore.utils && typeof PLNCore.utils.escapeRegExp === 'function')
        ? PLNCore.utils.escapeRegExp
        : (x => String(x).replace(/[.*+?^${}()|[\\]\\\\]/g, '\\$&'));
    return f(String(s));
}

function applySwapRules(originalName, deps) {
    try {
        const DBG = !!(window.__PLN_SWAP_DEBUG_ON || localStorage.getItem('wme_pln_debug_swap') === '1');
        let name = String(originalName || '');
        const swaps = (typeof plnCollectSwapRules === 'function')
            ? plnCollectSwapRules(deps)
            : (Array.isArray(deps?.swapWords) ? deps.swapWords : Array.isArray(window.swapWords) ? window.swapWords : []);

        if (DBG) plnLog('swap', 'applySwapRules', { originalName, swapsCount: Array.isArray(swaps) ? swaps.length : 0 });
        if (!swaps.length) { if (DBG) { plnLog('swap', 'skip: no swaps configured'); } return name; }

        const normalizeSpace = s => s.replace(/\s+/g, ' ').replace(/\s*-\s*/g, ' - ').trim();

        for (const raw of swaps) {
            if (!raw) { if (DBG) plnLog('swap', 'skip: null item'); continue; }
            const token = String((raw.word || raw.text || raw.token || '').trim());
            if (!token) { if (DBG) plnLog('swap', 'skip: empty token', raw); continue; }

            let where = String((raw.position || raw.where || raw.dir || raw.direction || '')).toLowerCase();
            if (where === 'antes' || where === 'before' || where === 'pre') where = 'before';
            if (where === 'despues' || where === 'después' || where === 'after' || where === 'post') where = 'after';
            if (where !== 'before' && where !== 'after') { if (DBG) plnLog('swap', `skip [${token}]: invalid position`, raw); continue; }

            const esc = token.replace(/[.*+?^${}()|[\]\\]/g, '\\$&').replace(/\s+/g, '\\s+');
            const SEP = '[\\s,.;:()\\[\\]\\-–—\\/]';

            const reFind = new RegExp(`(?:^|${SEP})${esc}(?=$|${SEP})`, 'iu');
            const reAnywhere = new RegExp(`(?:^|${SEP})${esc}(?=$|${SEP})`, 'giu');
            const reStart = new RegExp(`^\\s*${esc}(?=$|${SEP})`, 'iu');
            const reEnd = new RegExp(`(?:^|${SEP})${esc}\\s*$`, 'iu');

            if (DBG) plnLog('swap', `[${token}] → ${where}`);
            if (!reFind.test(name)) {
                if (DBG) { plnLog('swap', 'no-op: token not present in name', { name, token }); }
                continue;
            }

            if ((where === 'before' && reStart.test(name)) || (where === 'after' && reEnd.test(name))) {
                if (DBG) { plnLog('swap', 'no-op: already at target edge', { name }); }
                name = normalizeSpace(name);
                continue;
            }

            const before = name;
            name = name.replace(reAnywhere, ' ').replace(/\s{2,}/g, ' ').trim();
            name = where === 'before' ? `${token} ${name}`.trim() : `${name} ${token}`.trim();
            name = normalizeSpace(name);
            if (DBG) plnLog('swap', 'moved', { before, after: name });
        }

        if (DBG) { plnLog('swap', 'result =>', name); }
        return name;
    } catch (e) {
        if (window.__PLN_SWAP_DEBUG_ON) plnLog('error', '[PLN Swap] error', e);
        return originalName;
    }
}

function plnCollectSwapRules(deps) {
    try {
        const normDir = v => {
            v = String(v || '').toLowerCase();
            if (v === 'antes' || v === 'before' || v === 'pre' || v === 'start') return 'before';
            if (v === 'despues' || v === 'después' || v === 'after' || v === 'post' || v === 'end') return 'after';
            return null;
        };
        const key = s => String(s || '').normalize('NFD').replace(/[\u0300-\u036f]/g, '').toLowerCase().trim();
        const map = new Map();
        const setRule = (w, d, pri) => {
            w = String(w || '').trim();
            d = normDir(d);
            if (!w || !d) return;
            const k = key(w);
            const prev = map.get(k);
            if (!prev || (prev._pri || 0) <= pri) {
                map.set(k, { word: w, position: d, _pri: pri });
            }
        };
        (Array.isArray(deps?.swapWords) ? deps.swapWords : Array.isArray(window.swapWords) ? window.swapWords : []).forEach(x => {
            if (!x) return;
            if (typeof x === 'string') { setRule(x, 'before', 1); return; }
            const w = x.word || x.text || x.token || x.value || x.name;
            const d = x.position || x.where || x.dir || x.direction;
            setRule(w, d, 1);
        });
        const FORCED_DIR = { 'urbanizacion': 'after' };
        const FORCED_DISPLAY = { 'urbanizacion': 'Urbanización' };
        for (const [k, rec] of map.entries()) {
            const forced = FORCED_DIR[k];
            if (forced === 'after' || forced === 'before') rec.position = forced;
        }
        for (const fk in FORCED_DIR) {
            if (!map.has(fk)) {
                map.set(fk, { word: FORCED_DISPLAY[fk] || fk, position: FORCED_DIR[fk], _pri: 999 });
            }
        }
        const arr = Array.from(map.values()).map(({ word, position }) => ({ word, position }));
        arr.sort((a, b) => b.word.length - a.word.length);
        return arr;
    } catch (e) { return []; }
}

function processPlaceName(originalName, deps) {
    let processedName = (originalName || '').trim();
    const exclusions = new Map();
    let placeholderIndex = 0;
    const EXC  = deps?.excludedWords  ?? window.excludedWords;
    const EXCM = deps?.excludedWordsMap ?? window.excludedWordsMap;
    const REPL = deps?.replacementWords ?? (typeof window.replacementWords === 'object' ? window.replacementWords : {});
    const SKIP = deps?.skipGeneralReplacements ?? (typeof window.skipGeneralReplacements === 'boolean' ? window.skipGeneralReplacements : false);
    const DICT = deps?.dictionaryWords ?? window.dictionaryWords;

    if (EXC && EXC.size > 0) {
        const sortedExclusions = Array.from(EXC).sort((a, b) => b.length - a.length);
        sortedExclusions.forEach(phrase => {
            if (!phrase) return;
            const escapedPhrase = phrase.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
            const regex = new RegExp(`\\b${escapedPhrase}\\b`, 'gi');
            processedName = processedName.replace(regex, (match) => {
                const placeholder = `__PLN_EXCLUDED_${placeholderIndex}__`;
                exclusions.set(placeholder, phrase);
                placeholderIndex++;
                return placeholder;
            });
        });
    }

    processedName = processedName.replace(/\|/g, ' - ');
    processedName = processedName.replace(/\s{2,}/g, ' ').trim();
    const words = processedName.split(/\s+/).filter(word => word.length > 0);
    const commonWords = ['es', 'de', 'del', 'el', 'la', 'los', 'las', 'y', 'e', 'o', 'u', 'un', 'una', 'unos', 'unas', 'a', 'en', 'con', 'tras', 'por', 'al', 'lo'];

    const normalizedWords = words.map((word, index) => {
        if (word.startsWith('__PLN_EXCLUDED_')) {
            return word;
        }
        if (word === '-') return '-';
        const lower = (word || '').toLowerCase();
        if (commonWords.includes(lower)) {
            const prevIsHyphen = index > 0 && words[index - 1] === '-';
            const prevIsOpenParen = index > 0 && words[index - 1] === '(';
            if (index === 0 || prevIsHyphen || prevIsOpenParen) {
                return lower.charAt(0).toUpperCase() + lower.slice(1);
            }
            return lower;
        }
        return normalizeWordInternal(word, index === 0, false, { EXC, EXCM, REPL, SKIP, DICT });
    });
    processedName = normalizedWords.join(" ");

    processedName = aplicarReglasEspecialesNombre(processedName, { EXC, EXCM, REPL, SKIP, DICT });
    processedName = postProcessQuotesAndParentheses(processedName);

    if (REPL && typeof REPL === 'object' && Object.keys(REPL).length > 0) {
        processedName = aplicarReemplazosDefinidos(processedName, REPL);
    }
    processedName = aplicarReemplazosGenerales(processedName, { SKIP });

    exclusions.forEach((originalPhrase, placeholder) => {
        processedName = processedName.replace(placeholder, originalPhrase);
    });

    let finalName = processedName.replace(/\s{2,}/g, ' ').trim();
    finalName = finalName.replace(/\s*-\s*$/, '');
    if (finalName.endsWith('.')) {
        finalName = finalName.slice(0, -1);
    }
    return finalName;
}

function normalizePlaceName(word) {
    if (!word || typeof word !== "string") return "";
    if (word.includes("/")) {
        if (word === "/") return "/";
        return word.split("/").map(part => normalizePlaceName(part.trim())).join("/");
    }
    if (/^[0-9]+$/.test(word)) return word;
    word = word.replace(/(\d)([a-zA-Z])/g, (_, num, letter) => `${num}${letter.toUpperCase()}`);
    const romanRegexStrict = /^(C{0,3}(XC|XL|L?X{0,3})?(IX|IV|V?I{0,3})?)$/i;
    if (romanRegexStrict.test(word)) return word.toUpperCase();
    if (/^[A-ZÁÉÍÓÚÑ0-9.]+$/.test(word) && word.length > 1 && (word.includes('.') || /^[A-ZÁÉÍÓÚÑ]+$/.test(word))) {
        if (word.toUpperCase() === "DI" || word.toUpperCase() === "SI") return word.charAt(0).toUpperCase() + word.slice(1).toLowerCase();
        return word;
    }
    return word.charAt(0).toUpperCase() + word.slice(1).toLowerCase();
}

function normalizeWordInternal(word, isFirstWordInSequence = false, isInsideQuotesOrParentheses = false, deps) {
    if (!word || typeof word !== 'string') return "";
    const lowerWord = word.toLowerCase();
    if (deps?.DICT && deps.DICT.has(lowerWord)) {
        const originalDictEntry = Array.from(deps.DICT).find(w => w.toLowerCase() === lowerWord);
        if (originalDictEntry) return originalDictEntry;
    }
    if (deps?.EXC && deps?.EXCM) {
        const cleanedInputWord = removeDiacritics(word.toLowerCase());
        const firstChar = word.charAt(0).toLowerCase();
        const excludedCandidates = deps.EXCM.get(firstChar);
        if (excludedCandidates) {
            for (const excludedWord of excludedCandidates) {
                if (removeDiacritics(excludedWord.toLowerCase()) === cleanedInputWord) return excludedWord;
            }
        }
    }
    if (word.includes('-') && /\p{L}-\p{L}/u.test(word)) {
        return word.split('-').map(part => /^[A-ZÁÉÍÓÚÑ0-9.]+$/.test(part) && part.length > 1 ? part : part.charAt(0).toUpperCase() + part.slice(1).toLowerCase()).join('-');
    }
    if (word.includes("'")) return handleApostropheWord(word);
    if (/^[A-ZÁÉÍÓÚÑ0-9.&]+$/.test(word) && word.length > 1) return word;
    if (/^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$/i.test(word)) return word.toUpperCase();
    const commonWords = ['es', 'de', 'del', 'el', 'la', 'los', 'las', 'y', 'e', 'o', 'u', 'un', 'una', 'unos', 'unas', 'a', 'en', 'con', 'tras', 'por', 'al', 'lo'];
    const lowerWordForCommonCheck = word.toLowerCase().replace('.', '');
    if (commonWords.includes(lowerWordForCommonCheck)) {
        if (lowerWordForCommonCheck === "y") return isFirstWordInSequence ? "Y" : "y";
        if (lowerWordForCommonCheck === "e") return isFirstWordInSequence ? "E" : "e";
        if (isFirstWordInSequence && !isInsideQuotesOrParentheses) return lowerWordForCommonCheck.charAt(0).toUpperCase() + lowerWordForCommonCheck.slice(1);
        return lowerWordForCommonCheck;
    }
    let wordWithoutPunctuation = word.endsWith('.') ? word.slice(0, -1) : word;
    return wordWithoutPunctuation.charAt(0).toUpperCase() + wordWithoutPunctuation.slice(1).toLowerCase();
}

function aplicarReemplazosGenerales(name, deps) {
    if (deps?.SKIP) return name;
    name = removeEmoticons(name);
    const reglas = [
        { buscar: /\|/g, reemplazar: " - " },
        { buscar: /\s*\/\s*/g, reemplazar: " - " },
        { buscar: /\[[Pp]\]/g, reemplazar: "" },
        { buscar: /(\p{L}|\p{N})\s*-\s*(\p{L}|\p{N})/gu, reemplazar: "$1 - $2" },
        { buscar: /\s*-\s*/g, reemplazar: " - " },
        { buscar: /\s{2,}/g, reemplazar: ' ' },
    ];
    reglas.forEach(regla => { name = name.replace(regla.buscar, regla.reemplazar); });
    name = name.replace(/\s{2,}/g, ' ').trim();
    name = name.replace(/\s*-\s*-\s*/g, ' - ');
    name = name.replace(/--/g, '-');
    return name;
}

function aplicarReglasEspecialesNombre(newName, deps) {
    newName = newName.replace(/-(\s*)([^\s]+)/g, (match, spaces, nextWord) => `-${spaces}${normalizeWordInternal(nextWord, true, false, deps)}`);
    newName = newName.replace(/\.\s+([a-z])/g, (match, letter) => `. ${letter.toUpperCase()}`);
    newName = newName.replace(/(\(\s*)([a-zA-Z])/g, (match, P1, P2) => P1 + P2.toUpperCase());
    newName = newName.replace(/\s([a-zA-Z])$/, (match, letter) => ` ${letter.toUpperCase()}`);
    return newName.replace(/\s{2,}/g, ' ').trim();
}

function aplicarReemplazosDefinidos(text, replacementRules) {
    let newText = text;
    if (typeof replacementRules !== 'object' || replacementRules === null || Object.keys(replacementRules).length === 0) return newText;
    const sortedFromKeys = Object.keys(replacementRules).sort((a, b) => b.length - a.length);
    for (const fromKey of sortedFromKeys) {
        const toValue = replacementRules[fromKey];
        const escapedFromKey = plnEscapeRegExpLocal(String(fromKey));
        let regex;
        const wordCharSet = '[\\p{L}\\p{N}_-]';
        if (toValue.endsWith(' -')) {
            regex = new RegExp(`(^|[^\\p{L}\\p{N}_\\-])(${escapedFromKey})(\\s+)(${wordCharSet}+)?(?=$|[^\\p{L}\\p{N}_-])`, 'giu');
        } else {
            regex = new RegExp(`(^|[^\\p{L}\\p{N}_-])(${escapedFromKey})(?=$|[^\\p{L}\\p{N}_-])`, 'giu');
        }
        newText = newText.replace(regex, (match, ...args) => {
            const originalString = args[args.length - 1];
            const offset = args[args.length - 2];
            let delimitadorPrevio, matchedFromKey, capturedSpaces, nextWordIfCaptured;
            if (toValue.endsWith(' -')) {
                [delimitadorPrevio, matchedFromKey, capturedSpaces, nextWordIfCaptured] = args;
            } else {
                [delimitadorPrevio, matchedFromKey] = args;
            }
            if (toValue.endsWith(' -')) {
                return delimitadorPrevio + toValue + (nextWordIfCaptured || '');
            }
            return delimitadorPrevio + toValue;
        });
    }
    return newText;
}

function isExcludedWord(word, deps) {
    if (!word || !(deps?.EXC || window.excludedWords)) return null;
    const clean = w => w.trim().toLowerCase().normalize("NFD").replace(/[\u0300-\u036f]/g, "");
    const cleanedWord = clean(word);
    for (const excl of (deps?.EXC || window.excludedWords)) {
        if (clean(excl) === cleanedWord) {
            return excl;
        }
    }
    return null;
}

function plnApplyExclusions(str, deps) {
    try {
        const reWord = /([\p{L}\p{M}][\p{L}\p{M}\.'’]*)/gu;
        return String(str || '').replace(reWord, (m) => {
            try {
                const excl = typeof isExcludedWord === 'function' ? isExcludedWord(m, deps) : null;
                return excl ? excl : m;
            } catch (_) { return m; }
        });
    } catch (_) { return String(str || ''); }
}

function handleApostropheWord(word) {
    const parts = word.split("'");
    if (parts.length === 2) {
        const [before, after] = parts;
        if (after.toLowerCase() === 's') {
            return before + "'s";
        } else {
            return before + "'" + (after.charAt(0).toUpperCase() + after.slice(1).toLowerCase());
        }
    }
    return word;
}

function postProcessQuotesAndParentheses(text) {
    if (typeof text !== 'string') return text;
    const capitalizeFirstLetter = (string) => !string ? string : string.charAt(0).toUpperCase() + string.slice(1);
    text = text.replace(/"([^"]*)"/g, (match, content) => `"${capitalizeFirstLetter(content.trim())}"`);
    text = text.replace(/\(([^)]*)\)/g, (match, content) => `(${capitalizeFirstLetter(content.trim())})`);
    return text.replace(/\s+/g, ' ').trim();
}

function removeEmoticons(text) {
    if (!text || typeof text !== 'string') return '';
    const emojiRegex = /[\p{Emoji_Presentation}\p{Extended_Pictographic}]/gu;
    return text.replace(emojiRegex, '').trim().replace(/\s{2,}/g, ' ');
}

function isValidExcludedWord(newWord, deps) {
    if (!newWord) return { valid: false, msg: "La palabra no puede estar vacía." };
    const lowerNewWord = newWord.toLowerCase();
    const DICT = deps?.dictionaryWords || window.dictionaryWords;
    const EXC  = deps?.excludedWords   || window.excludedWords;
    const EXCM = deps?.excludedWordsMap|| window.excludedWordsMap;
    if (newWord.length === 1) return { valid: false, msg: "No se permite agregar palabras de un solo caracter." };
    if (/[-'\s]/.test(newWord)) return { valid: true };
    if (/^[^a-zA-Z0-9áéíóúÁÉÍÓÚñÑ]+$/.test(newWord)) return { valid: false, msg: "No se permite agregar solo caracteres especiales." };
    if (DICT) {
        if (Array.from(DICT).some(w => w.toLowerCase() === lowerNewWord)) {
            return { valid: false, msg: "La palabra ya existe en el diccionario (sin considerar mayúsculas/minúsculas). No se puede agregar a especiales." };
        }
        if (Array.from(DICT).some(w => w === newWord)) {
            return { valid: false, msg: "La palabra (con esta capitalización exacta) ya existe en el diccionario. No se puede agregar a especiales." };
        }
    }
    const commonWords = ['es', 'de', 'del', 'el', 'la', 'los', 'las', 'y', 'e', 'o', 'u', 'un', 'una', 'unos', 'unas', 'a', 'en', 'con', 'tras', 'por', 'al', 'lo'];
    if (commonWords.includes(lowerNewWord)) return { valid: false, msg: "Esa palabra es muy común y no debe agregarse a la lista." };
    if (EXC) {
        if (EXC.has(newWord)) return { valid: false, msg: "La palabra (con esta capitalización exacta) ya está en la lista." };
        if (EXCM) {
            const firstChar = lowerNewWord.charAt(0);
            const candidatesForFirstChar = EXCM.get(firstChar);
            if (candidatesForFirstChar) {
                for (const existingWord of candidatesForFirstChar) {
                    if (existingWord.toLowerCase() === lowerNewWord) {
                        return { valid: false, msg: "Esta palabra ya existe en la lista (con diferente capitalización)." };
                    }
                }
            }
        }
    }
    return { valid: true };
}