User:GeneralNotability/edit-filter-hit-analyzer.js

From Wikipedia, the free encyclopedia
Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.
// <nowiki>
// @ts-check
// More information on how an edit filter was tripped

importStylesheet('w:en:User:GeneralNotability/edit-filter-hit-analyzer.css' );
/**
 * @typedef EditFilterLine
 * @type {Object}
 * @property {string} text Text of the line
 * @property {string} normedText Text with some modifications applied for parsing
 * @property {string[]} variables Variables found in the line
 * @property {number} indentation how far to indent the line
 */

// Values of every variable we know about, keyed by variable name. Seeded in efa_main
// from wgAbuseFilterVariables and extended in efa_parseRules with the variables that
// the filter itself assigns.
const efa_knownVars = {};

// Matches the page name of a single log entry (Special:AbuseLog/<id>)
const efa_PAGE_NAME_RE = /Special:AbuseLog\/\d+/;
// Matches a link to a filter page: group 1 is the page title, group 2 the filter ID
const efa_FILTER_PAGE_RE = /\/wiki\/(Special:AbuseFilter\/(\d+))/;
// Vars in this list shouldn't have their full content displayed because they're usually really big
const efa_HIDDEN_VARS = [ 'old_wikitext', 'new_wikitext', 'edit_diff', 'all_links', 'added_lines', 'removed_lines', 'new_html' ];

// Parser regexes
// Matches `name := "value"`: group 1 is the variable name, group 2 the quoted value
const efa_REGEX_ASSIGNMENT_RE = /(\w+)\s*:=\s*"(.*)"/;
// Matches `<text> rlike|irlike|regex <pattern>`: group 1 is the expression being tested,
// group 2 the operator, group 3 the pattern (a variable name or a quoted literal).
// The prefix must be lazy: a greedy (.*) swallows the leading "i" of "irlike", so the
// operator would always be reported as "rlike". The \b around the operator keeps
// identifiers that merely contain a keyword (e.g. regex_foo) from matching.
const efa_RLIKE_RE = /\b(.*?)\s*\b(i?rlike|regex)\b\s*(\w+|".*")/;

// Future reference: if we want to do ccnorm ourselves, we can pull the conversion list
// from https://phab.wmfusercontent.org/file/data/jcoued3dziiwwwdr53lp/PHID-FILE-lkxia6juxnhqt263dbrj/equivset.json

/**
 * Annotate an edit filter hit page with a breakdown of the filter that was tripped.
 *
 * Inserts a "Filter rule analysis" heading and list before the "Action parameters"
 * heading. Each statement of the filter becomes a list item showing the statement,
 * the values of the known variables it mentions and, for rlike/irlike/regex
 * comparisons, a link that opens the regex and the tested text at regex101.
 *
 * @return {Promise<void>}
 */
async function efa_main() {
	// populate knownVars with built-in values
	// Fall back to an empty object so a page that does not export the variables
	// does not make Object.entries throw
	const hitVariables = mw.config.get('wgAbuseFilterVariables') || {};
	Object.entries(hitVariables).forEach(([ key, value ]) => {
		efa_knownVars[key] = value;
	});

	const $actionParams = $('h3:contains("Action parameters")', document);
	$('<h3>').text('Filter rule analysis').insertBefore($actionParams);
	const $ruleAnchor = $('<ul>').attr('id', 'efa-anchor').insertBefore($actionParams);
	// Find the link which goes to Special:AbuseFilter, then pull out the filter ID
	const filterHref = $('a', document).filter(function () {
		return efa_FILTER_PAGE_RE.test(this.getAttribute('href'));
	}).attr('href');
	const filterLink = filterHref ? filterHref.match(efa_FILTER_PAGE_RE) : null;
	if (!filterLink) {
		// No link to the filter on this page, so there is nothing to analyze
		return;
	}
	const filterPattern = await efa_getFilter(filterLink[2]);
	if (!filterPattern) {
		// Something went wrong (or we can't access the filter),
		// bail out
		return;
	}
	const filterRules = efa_parseRules(filterPattern);
	filterRules.forEach((rule) => {
		const $bullet = $('<li>').attr('style', 'margin-left:' + (10 * rule.indentation + 10) + 'px;');
		$('<span>').addClass('efa-rule').text(rule.text).appendTo($bullet);
		rule.variables.forEach((variable) => {
			const shownValue = efa_HIDDEN_VARS.includes(variable) ?
				'(not shown)' :
				efa_knownVars[variable];
			// Variable values can contain text supplied by the editor (summary, user
			// name, ...). jQuery's .append() parses strings as HTML, so set the value
			// as text instead to keep markup in it inert.
			$('<span>').addClass('efa-data').text(variable + ': ' + shownValue).appendTo($bullet);
		});

		const rlikeMatch = rule.normedText.match(efa_RLIKE_RE);
		if (rlikeMatch) {
			// If this is a regex, try to expand it and generate a link
			const textExpression = rlikeMatch[1];
			const matchType = rlikeMatch[2];
			let regexExpression = rlikeMatch[3];
			// Whether to apply substitution on the regex side (don't if the regex
			// is a quoted literal rather than a variable)
			let subRe = true;
			const reQuoteSearch = regexExpression.match(/.*?"(.*)"/);
			if (reQuoteSearch) {
				// Remove the quotes around a literal regex
				regexExpression = reQuoteSearch[1];
				// Don't attempt substitution since this is a literal
				subRe = false;
			}
			// Expand variables (or possibly function calls on a variable)
			// TODO: this is really simplistic (obviously) - strip function calls and get
			// an exact match
			// Names are always looked up in the expressions as written in the filter,
			// never in an already substituted value: otherwise a variable name that
			// happens to occur inside a value would trigger a second substitution.
			let reText = textExpression;
			let re = regexExpression;
			for (const [ varName, varValue ] of Object.entries(efa_knownVars)) {
				if (textExpression.includes(varName)) {
					// String() rather than .toString() so null values don't throw
					reText = String(varValue);
				}
				if (subRe && regexExpression.includes(varName)) {
					re = String(varValue);
				}
			}
			// abusefilter entries are PCRE and by default use the 'u' flag.
			// if irlike is being used, add the i flag as well.
			let flags = 'u';
			if (matchType === 'irlike') {
				flags += 'i';
			}
			const re101url = `https://regex101.com/?regex=${encodeURIComponent(re)}&testString=${encodeURIComponent(reText)}&flags=${flags}`;
			$bullet.append(' ').append($('<a>').attr('href', re101url).text('(view at regex101)'));
		}
		$bullet.appendTo($ruleAnchor);
	});
}

/**
 * Turn a filter's pattern into a list of rules
 *
 * Newlines are removed and the pattern is split at every &, ; or | that is not
 * inside a double-quoted string; each separator becomes an entry of its own.
 * Every entry is then given an indentation level based on paren nesting and the
 * list of known variables it mentions.
 *
 * Side effect: every `name := "value"` assignment found in the pattern is stored
 * in efa_knownVars.
 *
 * @param {string} pattern Original text pattern
 *
 * @return {EditFilterLine[]} List of rules
 */
function efa_parseRules(pattern) {
	// Strip all newline characters and split by statement
	// The second part is taken from https://stackoverflow.com/questions/11502598/how-to-match-something-with-regex-that-is-not-between-two-special-characters
	// It matches all split characters (&, ;, |) as long as they are _not_ between quotes.
	// The lookahead only succeeds when an even number of double quotes follows; it
	// does not understand escaped quotes.
	const filterLines = pattern.replace(/(\r|\n)/g, '').split(/([&;|](?=(?:[^"]*"[^"]*")*[^"]*$))/g);
	/** @type {EditFilterLine[]} */
	const annotatedFilterLines = filterLines.map((line) => {
		// Trim, then replace every long whitespace with a single space. Whitespace
		// between quotes is left alone (same lookahead as above) so that literal
		// strings and regexes keep their exact content.
		const cleanedUpLine = line.trim().replace(/\s+(?=(?:[^"]*"[^"]*")*[^"]*$)/g, ' ');
		return { text: cleanedUpLine, normedText: cleanedUpLine,
			variables: [], indentation: 0 };
	});

	// Indentation pass: figure out how deep each statement is nested in parens,
	// then create a "normed" version which strips the extra paren(s)
	// While we're in there, save variable assignments
	let indent = 0;
	annotatedFilterLines.forEach((line) => {
		// split() yields one more piece than there are parens; the extra one
		// cancels out in the subtraction below
		const openParens = line.text.split(/\(/).length;
		const closeParens = line.text.split(/\)/).length;
		// Because of how we split the strings, a block of indented text will
		// always start with an extra open paren on the starting rule, and close
		// with an extra one on the ending rule (but we want both of those lines)
		// indented
		const deltaParens = openParens - closeParens;
		if (deltaParens > 0) {
			indent += deltaParens;
			line.indentation = indent;
			// Remove the extra paren (the first one) from the normed text
			line.normedText = line.text.replace('(', '');
		} else if (deltaParens < 0) {
			line.indentation = indent;
			indent += deltaParens; // Remember, deltaparens is negative here, so add it
			// Remove the extra paren (the last one) from the normed text
			line.normedText = line.text.replace(/\)(?=[^)]*$)/, '');
		} else {
			line.indentation = indent;
		}
		const varAssignment = line.normedText.match(efa_REGEX_ASSIGNMENT_RE);
		if (varAssignment) {
			efa_knownVars[varAssignment[1]] = varAssignment[2];
		}
	});

	// Annotate by going through and identifying variables used in the lines
	Object.keys(efa_knownVars).forEach((varName) => {
		// Escape the name so it is always matched literally, whatever it contains
		const escapedName = varName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
		const varRe = new RegExp('\\b' + escapedName + '\\b');
		annotatedFilterLines.forEach((line) => {
			if (!varRe.test(line.text)) {
				return;
			}
			const assignmentMatch = line.text.match(efa_REGEX_ASSIGNMENT_RE);
			if (assignmentMatch && assignmentMatch[1] === varName) {
				// Don't list the variable on the line that assigns
				return;
			}
			line.variables.push(varName);
		});
	});
	return annotatedFilterLines;
}

/**
 * Look up the pattern (rule text) of one edit filter through the MediaWiki API.
 *
 * The query is restricted to the single ID by using it as both start and end of
 * the range. Any failure is logged to the console and reported as an empty string.
 *
 * @param {string|number} filterId ID of the filter to fetch
 *
 * @return {Promise<string>} The `pattern` field of the first filter returned, or ''
 * when no filter came back or the request failed
 */
async function efa_getFilter(filterId) {
	try {
		const { query } = await new mw.Api().get({
			action: 'query',
			list: 'abusefilters',
			abfstartid: filterId,
			abfendid: filterId,
			abfprop: 'pattern'
		});
		const matches = query.abusefilters;
		// An empty list means the API found no filter with this ID
		return matches.length < 1 ? '' : matches[0].pattern;
	} catch (error) {
		console.log(error);
		return '';
	}
}

// On document load, check if this page is an edit filter hit - if so,
// load the EF stuff
$(function () {
	if (efa_PAGE_NAME_RE.test(mw.config.get('wgPageName'))) {
		// efa_main is async: handle a failure here instead of leaving the promise
		// floating, where it would only show up as an unhandled rejection
		efa_main().catch(function (error) {
			console.log(error);
		});
	}
});
// </nowiki>