User:Harej/citation-watchlist-staging.js
Appearance
Code that you insert on this page could contain malicious content capable of compromising your account. If you import a script from another page with "importScript", "mw.loader.load", "iusc", or "lusc", take note that this causes you to dynamically load a remote script, which could be changed by others. Editors are responsible for all edits and actions they perform, including by scripts. User scripts are not centrally supported and may malfunction or become inoperable due to software changes. A guide to help you find broken scripts is available. If you are unsure whether code you are adding to this page is safe, you can ask at the appropriate village pump. This code will be executed when previewing this page. |
Documentation for this user script can be added at User:Harej/citation-watchlist-staging. |
/* Per-wiki configuration */
const LANGUAGE = 'test';
const FAMILY = 'wikipedia';
const actionApiEndpoint = `https://${LANGUAGE}.${FAMILY}.org/w/api.php`;
const restApiEndpoint = 'https://api.wikimedia.org/core/v1';
// On-wiki pages this script reads its configuration from.
const publicSuffixList = 'Wikipedia:Citation_Watchlist/Public_Suffix_List';
const listOfLists = 'Wikipedia:Citation_Watchlist/Lists';
// Tooltip labels and emoji markers for the three severity levels.
const msgWarning = 'Warning';
const msgCaution = 'Caution';
const msgInspect = 'Inspect';
const warnEmoji = '\u2757';          // heavy exclamation mark
const cautionEmoji = '\u270B';       // raised hand
const inspectEmoji = '\uD83D\uDD0E'; // right-pointing magnifying glass
// Wikitext section headings that delimit each domain list.
const warnSectionHeader = '==Warn==';
const cautionSectionHeader = '==Caution==';
const inspectSectionHeader = '==Inspect==';
// Minimum spacing between consecutive Action API requests, in milliseconds.
const delayMs = 100;
/*
Citation Watchlist Script – Highlights watchlist entries when questionable sources are added
author: Hacks/Hackers
license: GPL 4.0
*/
// Mutable script-wide state, populated by runScript() before any
// watchlist entries are processed.
let publicSuffixSet = new Set(); // Public Suffix List rules (see fetchPublicSuffixList)
let warnList = new Set();        // root domains in the "Warn" lists
let cautionList = new Set();     // root domains in the "Caution" lists
let inspectList = new Set();     // root domains in the "Inspect" lists
let lastRequestTime = 0;         // timestamp of the last Action API request, for throttling
// Inserts an emoji marker (with a tooltip listing the matched domains) in
// front of a watchlist entry element. A data attribute per severity level
// guards against annotating the same element twice.
function prependEmojiWithTooltip(element, emoji, domains, tooltipText) {
  // Map each supported emoji to the attribute suffix that marks it as done.
  const typeForEmoji = {
    [warnEmoji]: 'warn',
    [cautionEmoji]: 'caution',
    [inspectEmoji]: 'inspect'
  };
  const processedType = typeForEmoji[emoji];
  if (processedType === undefined) {
    console.error('Unsupported emoji type');
    return;
  }
  const marker = `data-processed-${processedType}`;
  if (element.getAttribute(marker) === 'true') {
    return; // already annotated at this severity
  }
  const emojiSpan = document.createElement('span');
  emojiSpan.textContent = emoji + " ";
  emojiSpan.title = tooltipText + ": " + domains.join(", ");
  element.parentNode.insertBefore(emojiSpan, element);
  element.setAttribute(marker, 'true');
}
// Scans the current page (watchlist, recent changes, or page history) and
// returns one entry per change: { oldrevision, newrevision?, element }.
// The field names follow the REST compare call order used downstream
// ("from"/"to"), so `oldrevision` carries the newer revision ID when both
// are present.
async function parseWatchlist() {
  // Select all containers of the watchlist links to process them individually
  const entriesContainers = document.querySelectorAll('.mw-changeslist-links');
  const revisions = [];
  const revisionIds = [];
  let linkCounter = 0;
  // `link.href` is an absolute URL; parse it via URL so the first query
  // parameter is keyed correctly (URLSearchParams applied to a full URL
  // fuses the scheme/host/path into the first parameter's key).
  const queryParams = (link) => new URL(link.href).searchParams;
  // Build map of previous revision IDs
  for (const container of entriesContainers) {
    const prevLink = container.querySelector('a.mw-history-histlinks-previous');
    if (prevLink) {
      revisionIds.push(queryParams(prevLink).get('oldid'));
    }
  }
  const previousRevisionMap = await fetchPreviousRevisionIds(revisionIds);
  for (const container of entriesContainers) {
    const diffLink = container.querySelector('a.mw-changeslist-diff');
    const histLink = container.querySelector('a.mw-changeslist-history');
    const prevLink = container.querySelector('a.mw-history-histlinks-previous');
    const curLink = container.querySelector('a.mw-history-histlinks-current');
    if (diffLink) {
      // Watchlist / recent changes entry with a "diff" link.
      linkCounter += 1;
      const urlParams = queryParams(diffLink);
      revisions.push({
        oldrevision: urlParams.get('diff'),
        newrevision: urlParams.get('oldid'),
        element: diffLink.parentNode.parentNode
      });
    } else if (histLink) {
      // No "diff" link but a "hist" link: a newly created page.
      // Inspect its first revision in full.
      linkCounter += 1;
      const urlParams = queryParams(histLink);
      const pageID = urlParams.get('curid');
      const firstID = await fetchFirstRevisionId(pageID);
      revisions.push({
        oldrevision: firstID,
        element: histLink.parentNode.parentNode
      });
    } else if (prevLink) {
      // Page-history row: compare the row's revision against its parent.
      linkCounter += 1;
      const oldid = queryParams(prevLink).get('oldid');
      revisions.push({
        oldrevision: oldid,
        newrevision: previousRevisionMap[oldid],
        element: prevLink.parentNode.parentNode
      });
    } else if (curLink) {
      // No "prev" link means this row is the page's first revision.
      // We do not compare against the current revision; extract the oldid
      // and treat it like a new page.
      linkCounter += 1;
      const urlParams = queryParams(curLink);
      revisions.push({
        oldrevision: urlParams.get('oldid'),
        element: curLink.parentNode.parentNode
      });
    }
  }
  // Reaching here with no links at all means a page history with exactly
  // one revision; extract that revision via the page ID.
  if (linkCounter === 0) {
    const pageID = mw.config.get('wgArticleId');
    const firstID = await fetchFirstRevisionId(pageID);
    revisions.push({
      oldrevision: firstID,
      element: entriesContainers[0]
    });
  }
  return revisions;
}
// Promise-based sleep, used to throttle Action API requests.
function delay(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
// Builds an Action API URL with the given key/value pairs as query
// parameters. (Kept async to preserve the original Promise-returning
// interface for callers that await it.)
async function buildURL(params) {
  const url = new URL(actionApiEndpoint);
  for (const [key, value] of Object.entries(params)) {
    url.searchParams.append(key, value);
  }
  return url;
}
// Returns the registrable domain (public suffix plus one label) for a
// hostname, following the Public Suffix List algorithm. Supports plain
// rules ("co.uk"), wildcard rules ("*.ck"), and exception rules
// ("!www.ck", which mark the candidate itself as registrable).
// Falls back to the hostname itself when no rule matches or the hostname
// is itself a bare public suffix.
function getRootDomain(hostname, publicSuffixSet) {
  const domainParts = hostname.split('.');
  for (let i = 0; i < domainParts.length; i++) {
    const candidate = domainParts.slice(i).join('.');
    // Exception rule: the candidate is explicitly NOT a public suffix,
    // so it is the registrable domain.
    if (publicSuffixSet.has(`!${candidate}`)) {
      return candidate;
    }
    // Wildcard rule "*.<parent>" makes every direct child of <parent> a
    // public suffix.
    const wildcard = `*.${domainParts.slice(i + 1).join('.')}`;
    const isSuffix = publicSuffixSet.has(candidate) ||
      (i + 1 < domainParts.length && publicSuffixSet.has(wildcard));
    if (isSuffix) {
      // Registrable domain = suffix plus the one preceding label. If the
      // entire hostname is itself a suffix there is no preceding label;
      // return the hostname unchanged rather than a truncated tail.
      return i > 0 ? domainParts.slice(i - 1).join('.') : hostname;
    }
  }
  return hostname;
}
// Collects every http(s) URL found in the given text. A URL ends at
// whitespace, '<', or '"'. Returns an empty array when there are none
// (including for null/undefined input, which coerces to a URL-free string).
function extractAddedURLs(addedParts) {
  const urlRegex = /https?:\/\/[^\s<"]+/g;
  return String(addedParts).match(urlRegex) || [];
}
// Performs a throttled GET against the Action API and returns the parsed
// JSON body. Throws (after logging) on network failure or a non-2xx status.
async function fetchFromActionAPI(params) {
  const url = await buildURL(params);
  console.log(`Action API request: ${url}`);
  // Enforce at least delayMs between consecutive requests.
  const sinceLast = Date.now() - lastRequestTime;
  if (sinceLast < delayMs) {
    await delay(delayMs - sinceLast);
  }
  lastRequestTime = Date.now();
  try {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Network response was not ok: ${response.statusText}`);
    }
    return await response.json();
  } catch (error) {
    console.error('Error fetching data from MediaWiki API:', error);
    throw error;
  }
}
// Loads the on-wiki copy of the Public Suffix List and returns the set of
// suffix rules, skipping blank lines and "//" comment lines.
// Returns an empty Set on any failure (the caller treats that as fatal).
async function fetchPublicSuffixList() {
  const pslUrl = `https://${LANGUAGE}.${FAMILY}.org/wiki/${publicSuffixList}?action=raw`;
  try {
    const response = await fetch(pslUrl);
    if (!response.ok) {
      // Without this check a missing/protected page's error response would
      // be parsed as if it were list content.
      throw new Error(`Network response was not ok: ${response.statusText}`);
    }
    const content = await response.text();
    const suffixSet = new Set();
    const lines = content.split('\n');
    for (const line of lines) {
      const trimmed = line.trim();
      if (trimmed && !trimmed.startsWith('//')) {
        suffixSet.add(trimmed);
      }
    }
    return suffixSet;
  } catch (error) {
    console.error("Error fetching Public Suffix List:", error);
    return new Set();
  }
}
// Fetches a revision or diff from the Wikimedia REST API.
// Returns the revision wikitext (`source`, for single-revision requests) or
// the structured diff array (`diff`, for compare requests); returns null on
// any failure, which callers already tolerate.
async function fetchDiffFromAPI(apiUrl) {
  console.log(`Diff API request: ${apiUrl}`);
  try {
    const response = await fetch(apiUrl);
    if (!response.ok) {
      // Avoid treating an error payload as revision content.
      throw new Error(`Network response was not ok: ${response.statusText}`);
    }
    const data = await response.json();
    return data["source"] || data["diff"];
  } catch (error) {
    console.error('Error fetching API content:', error);
    return null;
  }
}
// For each parsed entry, fetch either the full wikitext of one revision
// (new pages) or a structured diff between two revisions, extract the URLs
// that were added, and annotate the entry's DOM element when an added URL's
// root domain appears on a warn/caution/inspect list.
async function fetchDiffAndProcess(revisions) {
  for (const revision of revisions) {
    // NOTE(review): `oldrevision` is the REST "from" revision and
    // `newrevision` the "to" revision. parseWatchlist fills `oldrevision`
    // from the `diff` URL parameter, so despite the name it appears to hold
    // the *newer* revision ID — confirm against the compare endpoint.
    let apiUrl = `${restApiEndpoint}/${FAMILY}/${LANGUAGE}/revision/${revision.oldrevision}`;
    if (revision.newrevision !== undefined) {
      apiUrl += `/compare/${revision.newrevision}`;
    }
    const diff = await fetchDiffFromAPI(apiUrl);
    let addedURLs = [];
    if (Array.isArray(diff)) { // actual diffs are arrays; new pages are strings
      // Types 2 and 4 represent "from".
      // Types 1 and 5 represent "to".
      // Type 3 represents changes within a line. It will be harder to extract URL changes in this case.
      let fromURLs = [];
      let toURLs = [];
      for (const diffLine of diff) {
        const lineURLs = extractAddedURLs(diffLine.text);
        for (const URL of lineURLs) {
          if (diffLine.type === 2 || diffLine.type === 4) {
            fromURLs.push(URL);
          } else if (diffLine.type === 1 || diffLine.type === 5) {
            toURLs.push(URL);
          }
        }
      }
      // Keep URLs on the "from" side that are absent from the "to" side.
      // Given the from/to ordering noted above, this yields the added URLs.
      const toURLSet = new Set(toURLs);
      addedURLs = fromURLs.filter(url => !toURLSet.has(url));
    } else {
      // Single revision: `diff` is the full wikitext, so every URL counts
      // as added. `diff` may also be null after a fetch failure, in which
      // case extractAddedURLs finds nothing.
      addedURLs = extractAddedURLs(diff);
    }
    console.log(`Old revision: ${revision.oldrevision}
New revision: ${revision.newrevision}
API URL: ${apiUrl}
Revision element: ${revision.element.innerHTML}
Added URLs: ${addedURLs.join(' ')}
`);
    // Record each matched root domain once, at its highest severity only
    // (warn takes precedence over caution, caution over inspect).
    const matchedWarnDomains = [];
    const matchedCautionDomains = [];
    const matchedInspectDomains = [];
    for (const url of addedURLs) {
      const hostname = new URL(url).hostname;
      const domain = getRootDomain(hostname, publicSuffixSet);
      if (warnList.has(domain) && !matchedWarnDomains.includes(domain)) {
        matchedWarnDomains.push(domain);
      } else if (cautionList.has(domain) && !matchedCautionDomains.includes(domain)) {
        matchedCautionDomains.push(domain);
      } else if (inspectList.has(domain) && !matchedInspectDomains.includes(domain)) {
        matchedInspectDomains.push(domain);
      }
    }
    // Prepend one emoji marker per severity level that matched.
    if (matchedWarnDomains.length > 0) {
      prependEmojiWithTooltip(revision.element, warnEmoji, matchedWarnDomains, msgWarning);
    }
    if (matchedCautionDomains.length > 0) {
      prependEmojiWithTooltip(revision.element, cautionEmoji, matchedCautionDomains, msgCaution);
    }
    if (matchedInspectDomains.length > 0) {
      prependEmojiWithTooltip(revision.element, inspectEmoji, matchedInspectDomains, msgInspect);
    }
  }
}
// Fetches one on-wiki list page and sorts its "*"-prefixed entries into
// three Sets according to the ==Warn==/==Caution==/==Inspect== section each
// entry appears under. Returns { warnList, cautionList, inspectList }.
// Entries before the first recognized heading are ignored.
async function fetchAndOrganizeDomainLists(pageName) {
  const params = {
    action: 'query',
    prop: 'revisions',
    titles: pageName,
    rvprop: 'content',
    rvslots: '*',
    format: 'json',
    origin: '*'
  };
  try {
    const data = await fetchFromActionAPI(params);
    const pages = data.query.pages;
    const firstPageId = Object.keys(pages)[0];
    const wikitext = pages[firstPageId].revisions[0].slots.main['*'];
    const result = {
      warnList: new Set(),
      cautionList: new Set(),
      inspectList: new Set()
    };
    // Dispatch table: section heading -> the Set it selects.
    const listForHeading = {
      [warnSectionHeader]: result.warnList,
      [cautionSectionHeader]: result.cautionList,
      [inspectSectionHeader]: result.inspectList
    };
    let activeList = null;
    for (const rawLine of wikitext.split('\n')) {
      const heading = rawLine.trim();
      if (heading in listForHeading) {
        activeList = listForHeading[heading];
      }
      if (rawLine.startsWith('*') && activeList) {
        activeList.add(rawLine.substring(1).trim());
      }
    }
    return result;
  } catch (error) {
    console.error('Error fetching or parsing the page content:', error);
    throw error;
  }
}
// Maps each given revision ID to its parent (previous) revision ID via one
// batched Action API query. Returns {} for empty input or on failure.
async function fetchPreviousRevisionIds(revisionIds) {
  if (revisionIds.length === 0) {
    // Nothing to look up — skip the request entirely; an empty `revids`
    // parameter would just be rejected by the API.
    return {};
  }
  const params = {
    action: 'query',
    prop: 'revisions',
    revids: revisionIds.join('|'), // join all revision IDs
    rvprop: 'ids',
    format: 'json',
    origin: '*'
  };
  try {
    const data = await fetchFromActionAPI(params);
    const pages = data.query.pages;
    const revisionMap = {};
    for (const pageId in pages) {
      const revisions = pages[pageId].revisions;
      if (revisions && revisions.length > 0) {
        for (const revision of revisions) {
          revisionMap[revision.revid] = revision.parentid;
        }
      }
    }
    return revisionMap;
  } catch (error) {
    console.error('Error fetching previous revision IDs:', error);
    return {};
  }
}
// Returns the ID of a page's earliest revision, or null when the page ID is
// missing/zero (e.g. wgArticleId on special pages) or the lookup fails.
async function fetchFirstRevisionId(pageID) {
  if (!pageID) {
    // A falsy page ID cannot resolve to revisions; skip the doomed request.
    // (Matches the original observable result: the API call would have
    // failed and this function returned null anyway.)
    return null;
  }
  const params = {
    action: 'query',
    pageids: pageID,
    prop: 'revisions',
    rvlimit: 1,
    rvdir: 'newer', // oldest revision first
    format: 'json',
    origin: '*'
  };
  try {
    const data = await fetchFromActionAPI(params);
    const pages = data.query.pages;
    const pageId = Object.keys(pages)[0];
    const revisions = pages[pageId].revisions;
    if (revisions && revisions.length > 0) {
      return revisions[0].revid;
    } else {
      throw new Error('No revisions found for this page.');
    }
  } catch (error) {
    console.error('Error fetching first revision ID:', error);
    return null;
  }
}
// Reads the "list of lists" page and returns the titles of the individual
// domain-list pages, taken from lines of the form "* [[Page Title]]".
async function fetchDomainListPages(pageName) {
  const params = {
    action: 'query',
    prop: 'revisions',
    titles: pageName,
    rvprop: 'content',
    rvslots: '*',
    format: 'json',
    origin: '*'
  };
  try {
    const data = await fetchFromActionAPI(params);
    const pages = data.query.pages;
    const firstPageId = Object.keys(pages)[0];
    const wikitext = pages[firstPageId].revisions[0].slots.main['*'];
    const pageTitles = [];
    for (const line of wikitext.split('\n')) {
      if (!line.startsWith('* [[')) {
        continue;
      }
      // First wikilink target on the line, e.g. [[Page Title]].
      const linkMatch = line.match(/\[\[([^\]]+)\]\]/);
      if (linkMatch) {
        pageTitles.push(linkMatch[1]);
      }
    }
    return pageTitles;
  } catch (error) {
    console.error('Error fetching or parsing the page content:', error);
    throw error;
  }
}
// Returns true when the named wiki page exists. The legacy Action API JSON
// format reports a missing page under the pseudo page ID "-1".
// Returns false on any request failure.
async function checkPageExists(pageName) {
  const params = {
    action: 'query',
    titles: pageName,
    format: 'json',
    origin: '*'
  };
  try {
    const data = await fetchFromActionAPI(params);
    const [firstPageId] = Object.keys(data.query.pages);
    return firstPageId !== "-1";
  } catch (error) {
    console.error('Error checking page existence:', error);
    return false;
  }
}
// Entry point: loads the Public Suffix List and all configured domain
// lists, then scans the current watchlist/recent-changes/history page and
// annotates entries whose diffs added listed domains.
async function runScript() {
  publicSuffixSet = await fetchPublicSuffixList();
  if (publicSuffixSet.size === 0) {
    console.error('Public Suffix List loading failed');
    return;
  }
  const listPages = await fetchDomainListPages(listOfLists);
  // Sequential on purpose: fetchFromActionAPI throttles request spacing.
  for (const pageName of listPages) {
    const exists = await checkPageExists(pageName);
    if (exists) {
      const lists = await fetchAndOrganizeDomainLists(pageName);
      lists.warnList.forEach(warnList.add, warnList);
      lists.cautionList.forEach(cautionList.add, cautionList);
      lists.inspectList.forEach(inspectList.add, inspectList);
    }
  }
  const watchlistRevisions = await parseWatchlist();
  await fetchDiffAndProcess(watchlistRevisions);
}
// The .catch prevents any failure from becoming an unhandled promise
// rejection (the original chain had no rejection handler).
runScript()
  .then(() => console.log('Citation Watchlist script loaded'))
  .catch((error) => console.error('Citation Watchlist script failed:', error));