Jump to content

User:Phlsph7/ListUnreferencedParagraphs.js

From Wikipedia, the free encyclopedia
Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.
(function(){
	const scriptName = 'List Unreferenced Paragraphs';

	// Register both toolbox entries once mediawiki.util is loaded and the DOM is ready.
	$.when(mw.loader.using('mediawiki.util'), $.ready).then(function(){
		addToolboxLink(scriptName, listUnreferencedParagraphs);
		addToolboxLink('Highlight Unreferenced Paragraphs', highlightUnreferencedParagraphs);

		/**
		 * Adds a link labelled `label` to the 'p-tb' portlet and runs `action` when it is clicked.
		 * The element id is the label followed by 'Id'.
		 *
		 * @param {string} label - link text
		 * @param {Function} action - called without arguments on click
		 */
		function addToolboxLink(label, action){
			const portletLink = mw.util.addPortletLink('p-tb', '#', label, label + 'Id');
			// addPortletLink returns null when the portlet does not exist;
			// skip this entry instead of throwing and losing the remaining ones.
			if(!portletLink){
				return;
			}
			portletLink.onclick = function(e) {
				// The href is only a placeholder ('#'); do not let the browser follow it.
				e.preventDefault();
				action();
			};
		}
	});
	
	function listUnreferencedParagraphs(){
		const timeout = 50;
		let stopProcessing = false;
		const content = document.getElementById('content');
		const contentContainer = content.parentElement;
		content.style.display = 'none';

		let scriptContainer = document.createElement('div');
		contentContainer.appendChild(scriptContainer);
		scriptContainer.outerHTML = `
	<div id="scriptContainer" style="display:flex; flex-direction: column;">
		<style>
			textarea {
				resize: none;
				padding: 5px;
			}
			button {
				margin: 5px;
			}
		</style>
		<h1>Unreferenced Paragraph Counter</h1>
		<div style="display:flex;">
			<div style="flex: 1; display:flex; flex-direction: column; margin: 5px;  height: 50vh; overflow-y: auto;">
				<label for="taList">Article Titles</label>
				<textarea id="taList" style="height: 100%;"></textarea>
			</div>
			<div style="flex: 2; display:flex; flex-direction: column; margin: 5px; height: 50vh; overflow-y: auto;">
				<label for="tableCounter">Overview table</label>
				<table id="tableCounter" class="wikitable" style="height: 100%; margin: 0px; width: 100%; border-collapse: collapse;">
					<thead>
						<tr>
							<th>Article title</th>
							<th title="paragraphs that require and lack references">Paragraphs without references</th>
							<th>Maintenance tags</th>
						</tr>
					</thead>
					<tbody id="tbodyCounter">
					
					</tbody>
				</table>
			</div>
		</div>
		<div style="display:flex; flex-direction: column">
			<div style="display:flex;">
				<button id="btStart" style="flex: 1;">Start</button>
				<button id="btStop" disabled style="flex: 1;">Stop</button>
				<button id="btCopy" style="flex: 1;">Copy</button>
			</div>
			<div>
				<button id="btClose" style="width: 100%;">Close</button>
			</div>
		</div>
	</div>
	`;
		// Start button: reads the title list, rebuilds the overview table and starts processing.
		const btStart = $('#btStart');
		btStart.click(function(){
			stopProcessing = false;
			btStart.prop("disabled", true);
			btStop.prop("disabled", false);

			// One title per line. Surrounding whitespace (including stray '\r') is dropped and
			// blank lines are skipped, so an empty textarea yields no rows and no requests.
			const enteredTitles = $('#taList').val()
				.split(/\r?\n/)
				.map(function(title){ return title.trim(); })
				.filter(function(title){ return title.length > 0; });

			// remove duplicates
			const articleTitles = [...new Set(enteredTitles)];

			// populate table; the cell ids carry the index of the title so that
			// the results can be written into the matching row later
			const tbodyCounter = $("#tbodyCounter");
			tbodyCounter.empty();
			articleTitles.forEach(function(articleTitle, i){
				const linkHTML = getLinkHTML(articleTitle);
				const row = `<tr><td>${linkHTML}</td><td id="td_unref_${i}" style="text-align: center;">-</td><td id="td_tags_${i}"></td></tr>`;
				tbodyCounter.append(row);
			});

			recursivelyProcessArticles(articleTitles, 0, timeout);

			// Builds the link through DOM properties so that the title is escaped
			// before it becomes part of the row markup.
			function getLinkHTML(articleTitle) {
				const link = document.createElement('a');
				link.href = 'https://en.wikipedia.org/wiki/' + encodeURIComponent(articleTitle);
				link.textContent = articleTitle;
				return link.outerHTML;
			}

		});
		// Stop button: raises the stop flag and swaps which of Start/Stop is enabled.
		const btStop = $('#btStop');
		btStop.on('click', function(){
			btStop.prop("disabled", true);
			btStart.prop("disabled", false);
			stopProcessing = true;
		});
		// Copy button: copies the text of the table body to the clipboard and reports the outcome.
		const btCopy = $('#btCopy');
		btCopy.click(function(){
			const tableText = getTextViaSelection();
			if(copyToClipboard(tableText)){
				mw.notify("The table was copied to the clipboard.");
			}
			else{
				mw.notify("The table could not be copied to the clipboard.", {type: 'error'});
			}

			// Selects the table body and returns the text of the selection.
			// NOTE(review): presumably used instead of textContent to keep the cell/row separators - confirm.
			function getTextViaSelection(){
				const tbodyCounter = $('#tbodyCounter')[0];
				const range = document.createRange();
				range.selectNodeContents(tbodyCounter);

				const selection = window.getSelection();
				selection.removeAllRanges();
				selection.addRange(range);
				return selection.toString();
			}

			// Copies `text` through a temporary textarea.
			// Returns true if the copy command reported success, false otherwise.
			function copyToClipboard(text) {
				const textarea = document.createElement('textarea');
				textarea.value = text;
				document.body.appendChild(textarea);
				try{
					textarea.select();
					return document.execCommand('copy');
				}
				catch(error){
					return false;
				}
				finally{
					// always remove the helper element, even if copying failed
					document.body.removeChild(textarea);
				}
			}
		});
		// Close button: stops any running processing, removes the panel and shows the page content again.
		const btClose = $('#btClose');
		btClose.on('click', function(){
			btStop.trigger('click');
			const panel = document.getElementById('scriptContainer');
			panel.remove();
			content.style.display = '';
		});

		/**
		 * Starts processing the article at `index` and schedules the next article after
		 * `timeout` milliseconds. Ends when the stop flag is set or all titles were handled.
		 *
		 * @param {string[]} articleTitles - titles to process
		 * @param {number} index - position of the title to process now
		 * @param {number} timeout - delay in milliseconds before the next title is started
		 */
		function recursivelyProcessArticles(articleTitles, index, timeout){
			const isFinished = stopProcessing || index >= articleTitles.length;
			if(isFinished){
				// restore the plain label and reuse the Stop handler to reset the button states
				btStop.text(`Stop`);
				btStop.trigger('click');
				return;
			}

			// the Stop button doubles as progress indicator
			btStop.text(`Stop (${index}/${articleTitles.length})`);
			processArticle(articleTitles, index);

			setTimeout(function(){
				recursivelyProcessArticles(articleTitles, index + 1, timeout);
			}, timeout);
		}

		/**
		 * Fetches the parsed HTML of one article via the parse API, counts its unreferenced
		 * paragraphs and writes the count and the maintenance tags into the table cells
		 * `td_unref_<index>` and `td_tags_<index>`. On any failure the cells show 'error'.
		 *
		 * @param {string[]} articleTitles - all titles of the current run
		 * @param {number} index - position of the title to process; also identifies the table row
		 * @returns {Promise<void>} settles after the cells have been updated; it does not reject
		 */
		function processArticle(articleTitles, index){
			const articleTitle = articleTitles[index];
			const articleSearchTerm = encodeURIComponent(articleTitle);
			const cellUnrefId = `td_unref_${index}`;
			const cellTagsId = `td_tags_${index}`;
			const wikiApiUrl = `https://en.wikipedia.org/w/api.php?action=parse&page=${articleSearchTerm}&format=json`;
			// remembers whether the count is already displayed so that a later failure does not overwrite it
			let unrefCellFilled = false;

			return fetch(wikiApiUrl).then(async function(response) { // jshint ignore:line
				const data = await response.json();
				if (!(data && data.parse && data.parse.text && data.parse.text['*'])) {
					showError();
					return;
				}

				// The article HTML is parsed into a separate document; it is not inserted into the page.
				const articleHTML = data.parse.text['*'];
				const parser = new DOMParser();
				const doc = parser.parseFromString(articleHTML, 'text/html');
				const paragraphContainer = $(doc).find('.mw-parser-output').eq(0);

				const unreferencedParagraphs = getParagraphInfo(paragraphContainer).unreferencedParagraphs;
				// The cells are looked up only now, so the rows that exist at this moment are updated.
				$('#' + cellUnrefId).text(`${unreferencedParagraphs.length}`);
				unrefCellFilled = true;

				// The tag string is built from article content, so it is inserted as text, not as markup.
				const maintenanceTagString = getMaintenanceTagString(paragraphContainer);
				$('#' + cellTagsId).text(maintenanceTagString);
			}).catch(function(error){
				// network failure, invalid JSON or an exception while analysing the article
				console.error(`Processing "${articleTitle}" failed:`, error);
				showError();
			});

			// Marks the row as failed without discarding a count that is already shown.
			function showError(){
				if(!unrefCellFilled){
					$('#' + cellUnrefId).text('error');
				}
				$('#' + cellTagsId).text('error');
			}
		}
		
		/**
		 * Summarises the maintenance tags found inside `element`: article message boxes
		 * ('.ambox') and inline templates ('.Inline-Template').
		 *
		 * @param {jQuery} element - container that is searched with `.find()`
		 * @returns {string} e.g. '1x More citations needed, 2x citation needed'; '' if nothing was found
		 */
		function getMaintenanceTagString(element){
			// A Map keeps the tags in the order in which they were first encountered.
			const templateOverview = new Map();
			for(const ambox of getAmboxes(element)){
				updateOverview(templateOverview, getAmboxTyp(ambox));
			}

			for(const inlineTemplate of getInlineTemplates(element)){
				updateOverview(templateOverview, getInlineTemplateType(inlineTemplate));
			}

			return getOverviewString(templateOverview);

			function getInlineTemplates(element){
				return element.find('.Inline-Template').toArray();
			}

			// Drops the first and the last character of the trimmed text,
			// e.g. '[citation needed]' becomes 'citation needed'.
			function getInlineTemplateType(inlineTemplate){
				const innerText = inlineTemplate.innerText.trim();
				return innerText.substring(1, innerText.length - 1);
			}

			function getAmboxes(element){
				return element.find('.ambox').toArray();
			}

			// The type is taken from the class 'box-<Type>' with underscores turned into spaces.
			// If the box has no such class, its text is used as the type.
			function getAmboxTyp(ambox){
				for(const entry of ambox.classList){
					if(entry.startsWith('box-')){
						return entry.substring(4).split('_').join(' ');
					}
				}

				return ambox.innerText.trim();
			}

			// Increases the counter of `entry`, starting at 1 for a new entry.
			function updateOverview(overview, entry){
				overview.set(entry, (overview.get(entry) || 0) + 1);
			}

			// Formats the counters as '<count>x <type>' separated by ', '.
			function getOverviewString(overview){
				return Array.from(overview, function(pair){
					return pair[1] + 'x ' + pair[0];
				}).join(', ');
			}
		}
	}
	
	/**
	 * Colours the paragraphs of the current page that were included in the check:
	 * '#faa' for paragraphs without references, '#afa' for the others.
	 * Logs the unreferenced paragraphs to the console and shows their number as notification.
	 */
	function highlightUnreferencedParagraphs(){
		const paragraphContainer = $('#mw-content-text').find('.mw-parser-output').eq(0);
		const {includedParagraphs, unreferencedParagraphs} = getParagraphInfo(paragraphContainer);

		includedParagraphs.forEach(function(paragraph){
			const lacksReference = unreferencedParagraphs.includes(paragraph);
			paragraph.style.background = lacksReference ? '#faa' : '#afa';
		});

		console.log(unreferencedParagraphs);
		mw.notify(`${unreferencedParagraphs.length} unreferenced paragraphs found`);
	}
	
	function getParagraphInfo(paragraphContainer){
		const minimalParagraphLength = 100;
		
		hideRefs(paragraphContainer[0]);
		
		combineMathBlocks(paragraphContainer.children().toArray());
		addElementsFollowingParagraphs(paragraphContainer.children().toArray());
		addElementsPrecedingParagraphs(paragraphContainer.children().toArray());
		
		showRefs(paragraphContainer[0]);
		
		const children = paragraphContainer.children();
		const releventChildren = [];
		for(let child of children){
			if(child.tagName.toLowerCase() === 'p'){
				releventChildren.push(child);
			}
			else if(child.classList.contains('mw-heading2')){
				releventChildren.push(child);
			}
		}
		
		const articleObject = convertToObject(releventChildren);
		removeIrrelevantSections(articleObject);
		const paragraphsInRelevantSections = convertToSimpleArray(articleObject);
		const includedParagraphs = removeShortParagraphs(paragraphsInRelevantSections);
		const unreferencedParagraphs = getUnreferencedParagraphs(includedParagraphs);

		return {
			'includedParagraphs': includedParagraphs,
			'unreferencedParagraphs': unreferencedParagraphs
		};
		
		// Sets the inline display of all '.reference' and '.Inline-Template' descendants to 'none'.
		function hideRefs(element){
			element.querySelectorAll('.reference, .Inline-Template').forEach(function(marker){
				marker.style.display = 'none';
			});
		}
		
		// Clears the inline display of all '.reference' and '.Inline-Template' descendants.
		function showRefs(element){
			element.querySelectorAll('.reference, .Inline-Template').forEach(function(marker){
				marker.style.display = '';
			});
		}
		
		// Merges an element that consists only of a math formula, together with the element
		// after it, into the element before it. The idea is that the formula artificially
		// divides a single paragraph into parts. The first and the last element are never
		// treated as formula because they lack a neighbour on one side.
		function combineMathBlocks(elements){
			const lastIndex = elements.length - 1;
			let position = 1;
			while(position < lastIndex){
				const candidate = elements[position];
				if(isMathBlock(candidate)){
					const host = elements[position - 1];
					host.appendChild(candidate);
					host.appendChild(elements[position + 1]);
				}
				position++;
			}

			// An element is a math block if its first child is a math element
			// and contributes the entire text of the element.
			function isMathBlock(element){
				const first = element.firstChild;
				if(!first || !first.classList){
					return false;
				}

				return first.classList.contains('mwe-math-element') && element.innerText === first.innerText;
			}
		}
		
		// If the meaning of the passage does not end with the html paragraph then add the next element to it.
		// A paragraph counts as unfinished if its text, ignoring style elements, ends with ',', ':' or a letter.
		function addElementsFollowingParagraphs(elements){
			const nonEndingCharacters = [',', ':'];
			for(let i = 0; i < elements.length - 1; i++){
				const element = elements[i];
				// only paragraphs are extended; checking this first avoids cloning other elements
				if(element.tagName !== 'P'){
					continue;
				}

				const innerText = getTextWithoutStyles(element);
				if(innerText.length === 0){
					continue;
				}

				const lastCharacter = innerText[innerText.length - 1];
				if(!nonEndingCharacters.includes(lastCharacter) && !isLetter(lastCharacter)){
					continue;
				}

				const nextElement = elements[i + 1];
				element.appendChild(nextElement);
				// A STYLE or LINK element is not the continuation itself, so the element after it
				// is added as well. The bound is the array length so that this also works when
				// that element is the last one.
				const isStyling = nextElement.tagName === 'STYLE' || nextElement.tagName === 'LINK';
				if(isStyling && i + 2 < elements.length){
					element.appendChild(elements[i + 2]);
				}
			}

			// A character is a letter if it has distinct lower and upper case forms.
			function isLetter(character){
				return character.toLowerCase() !== character.toUpperCase();
			}

			// Returns the trimmed text of a copy of the element from which all style elements were removed.
			function getTextWithoutStyles(element){
				const clone = element.cloneNode(true);
				// getElementsByTagName returns a live collection that shrinks while elements are
				// removed; iterating a snapshot makes sure that no style element is skipped.
				for(const styleElement of Array.from(clone.getElementsByTagName('style'))){
					styleElement.remove();
				}

				return clone.innerText.trim();
			}
		}
		
		// If the text of a paragraph starts with a lower case letter, it is treated as the
		// continuation of the previous element: that element is moved to the start of the paragraph.
		function addElementsPrecedingParagraphs(elements){
			elements.forEach(function(element, position){
				// the first element has no predecessor
				if(position === 0){
					return;
				}

				const text = element.innerText.trim();
				if(element.tagName !== 'P' || text.length === 0){
					return;
				}

				if(isLowerCaseLetter(text[0])){
					element.insertBefore(elements[position - 1], element.firstChild);
				}
			});

			// true for characters that have distinct case forms and are in their lower case form
			function isLowerCaseLetter(character){
				const lower = character.toLowerCase();
				const upper = character.toUpperCase();
				return lower !== upper && character === lower;
			}
		}

		/**
		 * Groups the elements by section. Elements before the first heading belong to 'Lead';
		 * every element with the class 'mw-heading2' starts the section named by its text
		 * without '[edit]'.
		 *
		 * @param {Element[]} elementArray - paragraphs and level 2 headings in document order
		 * @returns {Object} maps section names to arrays of the elements of that section
		 */
		function convertToObject(elementArray){
			const articleObject = {};
			let currentSection = "Lead";
			articleObject[currentSection] = [];

			for(const element of elementArray){
				if(element.classList.contains('mw-heading2')){
					// Trimmed so that whitespace or line breaks around the heading text
					// do not prevent the name from matching the list of excluded sections.
					currentSection = element.innerText.split('[edit]').join('').trim();
					// A repeated section name continues the existing entry instead of
					// discarding the paragraphs that were collected for it before.
					if(!Object.prototype.hasOwnProperty.call(articleObject, currentSection)){
						articleObject[currentSection] = [];
					}
				}
				else{
					articleObject[currentSection].push(element);
				}
			}

			return articleObject;
		}

		// Deletes the sections that are not checked for references from the given object (in place).
		// The names are compared exactly, including case.
		function removeIrrelevantSections(articleObject){
			const excludedSections = [
				'Lead',
				'Plot', 'Plots', 'Plot summary', 'Plot synopsis', 'Synopsis', 'Storylines', 'Appearances',
				'Further reading', 'See also', 'External links',
				'References', 'Bibliography', 'Notes',
				'Selected publications', 'Selected works',
				'Cited sources', 'Sources', 'Footnotes'
			];

			// deleting a name that is not present has no effect
			for(const sectionName of excludedSections){
				delete articleObject[sectionName];
			}
		}

		// Joins the element arrays of all sections into one new array, section after section.
		function convertToSimpleArray(articleObject){
			return [].concat(...Object.values(articleObject));
		}

		// Returns a new array with the paragraphs whose text has at least minimalParagraphLength characters.
		function removeShortParagraphs(paragraphArray){
			return paragraphArray.filter(function(paragraph){
				return paragraph.innerText.length >= minimalParagraphLength;
			});
		}

		// Returns a new array with the paragraphs for which isUnreferenced is true.
		function getUnreferencedParagraphs(paragraphArray){
			return paragraphArray.filter(function(paragraph){
				return isUnreferenced(paragraph);
			});
		}

		// A paragraph is unreferenced if it contains neither an element with the class
		// 'reference' nor a link whose href starts with '#CITEREF'.
		function isUnreferenced(paragraph){
			const $paragraph = $(paragraph);
			const footnoteCount = $paragraph.find('.reference').length;
			const citesViaAnchor = $paragraph.find('a').toArray().some(function(link){
				const target = link.getAttribute('href');
				return Boolean(target) && target.substring(0, 8) === '#CITEREF';
			});

			return footnoteCount === 0 && !citesViaAnchor;
		}
	}
	
})();