Jump to content

Wikipedia:Bots/Requests for approval/BHGbot 9/Step5 checker

From Wikipedia, the free encyclopedia
// AWB custom module to remove {{tl|Cleanup bare URLs}} when there are no remaining [[WP:Bare URLs|Bare URLs]]
// v0.07 18 October 2021
// -- BHG

// NOTE: this version is hacked for testing purposes.
// It skips every page except those which reach Step 5 and then fail there.

// Returns the bot's name-and-version label, which is used as the prefix of
// edit summaries and inside debugging messages.
// The label is the wikilinked bot name, the letter "v", the version string
// and the trial suffix, joined with no extra separators.
public string botNV ()
{
	const string name = "[[WP:BHGbot 9]]";
	const string version = "0.07 checker";

	// Suffix appended after the version. The original source kept an empty-string
	// alternative commented out, presumably for runs outside the trial -- confirm.
	const string trialSuffix = " Trial";

	return string.Concat(name, "v", version, trialSuffix);
}

// AWB custom-module entry point: checks whether a page tagged with
// {{Cleanup bare URLs}} (or one of its aliases) still contains any bare URLs.
//
// Parameters:
//   ArticleText   - wikitext of the page being processed
//   ArticleTitle  - page title (not used by this module)
//   wikiNamespace - namespace number of the page (not used by this module)
//   Summary       - out: edit summary for the page
//   Skip          - out: true when the page should be skipped
//
// Behaviour of this "checker" build (debugging == false):
//   * no banner tag on the page ............................ Skip = true, text returned unchanged
//   * {{Bare URL inline}} tags or bare-URL <ref>s remain .... Skip = true, text returned unchanged
//   * no URLs remain after the Step 5 removals .............. Skip = true, text returned unchanged
//   * URLs still remain after the Step 5 removals ........... Skip = false, text returned with a
//                                                             "STEP 5 FAIL" prefix
// When debugging is true, every exit except the final failure returns a MakeDebugMsg() report instead.
public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
{
	Skip = false;
	Summary = botNV() + ": ";

// String DECLARATIONS
	bool debugging = false;
	string debuggingEditSummary = "This is a test to debug " + botNV() + ". This edit should not have been saved, so please revert it";
	string successEditSummary = "Removed {{[[Template:Cleanup bare URLs|Cleanup bare URLs]]}}. This page currently has no bare URLs";

// DECLARE some regexes needed later on

	string CleanupBareURLsTagMatcher = @"\s*\{\{ *([tT]emplate *: *)?([Cc]leanup[_ ]+bare[_ ]+URLs|[Bb]are[_ ]+|[Bb]are|[Bb]are[_ ]+link|[Bb]are[_ ]+linkname|[Bb]are[_ ]+links|[Bb]are[_ ]+references|[Bb]are[_ ]+refs|[Bb]are[_ ]+URL|[Bb]are[_ ]+URLs|[Bb]are-URLs|[Bb]arelinks|[Bb]areURL|[Bb]areURLs|[Cc]leanup[_ ]+bare-URLs|[Cc]leanup[_ ]+link[_ ]+rot|[Cc]leanup[_ ]+link-rot|[Cc]leanup-Bare[_ ]+URLs|[Cc]leanup-barelinks|[Cc]leanup-link[_ ]+rot|[Cc]leanup-link-rot|[Cc]leanup-linkrot|[Cc]UBURL|[Ll]ink[_ ]+rot|[Ll]INKROT|[Ll]R) *(\|[^\}]*)?\}\}";
	string bareURLinlineTagMatcher = @"\s*\{\{ *([tT]emplate *: *)?([Bb]are[_ ]+URL[\- ]inline|[Ll]inkrot-inline|[Bb]are-inline|[Bb]are[_ ]+inline|[Bb]are[_ ]+url[_ ]+inline|[Bb]are-url[_ ]+inline|[Bb]are[_ ]+link[_ ]+inline|[Bb]are-link-inline|[Bb]are-url-inline|[Bb]are[_ ]+url) *(\|[^\}]*)?\}\}";
	string bareURLinlineRefMatcher = @"<ref[^>]*?>\s*\[?\s*https?:[^>< \|\[\]]+\s*\]?\s*<\s*/\s*ref\s*>";
	string completeRefTagMatcher = @"<ref[^>]*?>[^<>]*<\s*/\s*ref\s*>";
	string citeTemplateMatcher = @"\{\{ *([tT]emplate *: *)?([Cc](ite|itation))[^\]\{]*\}\}"; // Yes, this is crude, and will miss some cases
	                                                                                            // such as cites using {{sfnref}}, but it will do
	                                                                                            // for a start
	string URLtemplateMatcher = @"{\{ *([tT]emplate *: *)?(URL|Websites|URLWww|URLUrlw|URLUrl|URLUR|URLSite|URLWebsite|URLپیوند وب)\s*(\|[^\}]*)?\}\}";
	string OfficialWebsiteOrOfficialURLtemplateMatcher = @"\{\{ *([tT]emplate *: *)?([Oo]fficial[_ ]+URL|[Oo]fficial[_ ]+website|[Cc]onditionalURL|[Cc]onditional[_ ]+URL|[Gg]et[_ ]+URL[_ ]+from[_ ]+WikiData|[Oo]fficialURL|[Oo]fficial[_ ]+url|[Oo]fficialSite|[Oo]fficial|[Cc]ompany[_ ]+Website|[Oo]fficial[_ ]+site|[Oo]fficial[_ ]+Website|[Oo]ffficial[_ ]+website|[Oo]fficial[_ ]+web[_ ]+site|[Oo]fficial[_ ]+homepage|[Hh]omepage|[Hh]ome[_ ]+page|[Oo]fficialwebsite|[Mm]ain[_ ]+website|[Oo]fficialsite|[Oo]fficial[_ ]+webpage|[Oo]fficial[_ ]+Site|[Oo]web)\s*(\|[^\}]*)?\}\}";
	string URLparameterMatcher = @"\|\s*(website|url)\s*=\s*https?:[^\|\}]*";
	string nonBareURLMatcher = @"\[\s*https?://[^>< \|\[\]]+\s+[^\]]+\]"; // a bit crude

	// FIX: both patterns below were written with "(?!<" -- a negative LOOKAHEAD for a literal '<' --
	// where a negative LOOKBEHIND "(?<!" was clearly intended. The lookahead could never fail
	// immediately before "http", so the guard silently did nothing.
	string BareURLMatcher = @"((?<!\[ *)https?://[^>< \|\[\]]+|\[ *https?:[^>< \|\[\]]+\s*\])"; // currently unused
	string anyURLMatcher = @"(?<!\w)https?://\w"; // is this enough?

// STEP 1. check that the page contains the banner template {{Cleanup bare URLs}}, or one of its many aliases. If not, skip the page
	int CleanupBareURLsTagCount = Regex.Matches(ArticleText, CleanupBareURLsTagMatcher, RegexOptions.Singleline).Count;
	if (CleanupBareURLsTagCount == 0) {
		// No banner tag, so skip this page
		if (debugging) {
			Skip = false;
			Summary = debuggingEditSummary;
			return MakeDebugMsg(1, false, "Page contains no {{tl|CleanupBareURLsTagMatcher}} tag.", false, ArticleText);
		}
		Skip = true;
		return ArticleText;
	}

	// So we have a {{Cleanup bare URLs}} tag.
	// Now create a copy of the page without the tag. This is the text to save if there are no remaining bare URLs.
	// (In this checker build it is only ever shown in the Step 6 debug report.)
	string nuArticleText = Regex.Replace(ArticleText, CleanupBareURLsTagMatcher, "", RegexOptions.Singleline);

// STEP 2. count the number of {{Bare URL inline}} tags in the page, including aliases
	int bareURLinlineTagCount = Regex.Matches(ArticleText, bareURLinlineTagMatcher, RegexOptions.Singleline).Count;

// STEP 3. count the number of <ref>...</ref> tags whose whole content is a bare URL
	int bareURLrefCount = Regex.Matches(ArticleText, bareURLinlineRefMatcher, RegexOptions.Singleline).Count;

// STEP 4. if the total matches of step 2 + step 3 is greater than zero, then skip the page
	if ((bareURLinlineTagCount + bareURLrefCount) > 0) {
		// This page still has some bare URL refs or inline tags, so skip this page
		if (debugging) {
			Skip = false;
			Summary = debuggingEditSummary;
			return MakeDebugMsg(4, false, "Page still has some bare URL refs.\n* bareURLinlineTagCount=" + bareURLinlineTagCount + "\n* bareURLrefCount=" + bareURLrefCount, false, ArticleText);
		}
		Skip = true;
		return ArticleText;
	}

// STEP 5. check for bare URLs not in ref tags
//
// In this step we proceed by working on a copy of the article from which we remove URLs which are known to be OK.
// Then we check whether any URLs remain.
	RegexOptions step5Options = RegexOptions.Singleline | RegexOptions.IgnoreCase;
	string testArticleText = ArticleText; // scratch copy; never returned or saved

// STEP 5.A: remove all ref tags.
//            We have already checked for any bare URLs inside ref tags, so we can just remove all ref tags and their contents.
	testArticleText = Regex.Replace(testArticleText, completeRefTagMatcher, "", step5Options);

// STEP 5.B: remove all {{cite}} templates.
//            Anything inside a {{cite}} template is good, so just remove the whole template
	testArticleText = Regex.Replace(testArticleText, citeTemplateMatcher, "", step5Options);

// STEP 5.C: remove any {{URL}} templates.
//            Anything inside a {{URL}} template is good, so just remove the whole template
	testArticleText = Regex.Replace(testArticleText, URLtemplateMatcher, "", step5Options);

// STEP 5.D: remove any {{Official URL}} or {{Official website}} templates.
//            Anything inside an {{Official URL}} or {{Official website}} template is good, so just remove the whole template
	testArticleText = Regex.Replace(testArticleText, OfficialWebsiteOrOfficialURLtemplateMatcher, "", step5Options);

// STEP 5.E: remove any URL which is a value of a template parameter "url=" or "website="
//            e.g. "|website=https://example.com" or "|url=https://example.com"
	testArticleText = Regex.Replace(testArticleText, URLparameterMatcher, "", step5Options);

// STEP 5.F: remove any non-bare URLs
//            e.g. "[https://example.com foo]"
	testArticleText = Regex.Replace(testArticleText, nonBareURLMatcher, "", step5Options);

// STEP 6: does the page still contain any URLs?
	int URLsremainingAfterRemovingNonBareURlsCount = Regex.Matches(testArticleText, anyURLMatcher, step5Options).Count;
	if (URLsremainingAfterRemovingNonBareURlsCount == 0) {
		// SUCCESS! No bare URLs, so the tag could be removed
		if (debugging) {
			Skip = false;
			Summary = debuggingEditSummary;
			return MakeDebugMsg(6, true, "Page contains no [[WP:Bare URLs]].", true, nuArticleText);
		}
		// Checker build: successful pages are deliberately skipped and returned
		// unmodified, so that only the Step 5 failures are surfaced.
		Skip = true;
		Summary = botNV() + ": " + successEditSummary;
		return ArticleText;
	}

// FAILURE
// If we get here, then the page still contains URLs that none of the Step 5 removals accounted for
	Skip = false;
	return "STEP 5 FAIL \n\n\n" +ArticleText;
}

// Builds the text of a debugging report which replaces the page text when
// ProcessArticle runs with debugging switched on.
//
// Parameters:
//   stepNum      - number of the processing step which produced the report
//   testsOK      - true prints "Success" as the status, false prints "Fail"
//   debugMessage - free-text notes printed after "NOTES: "
//   textChanged  - whether pageText is a modified copy or the original article text
//   pageText     - the article text to show below the separator line
//
// Returns: the report header, a separator line, and then pageText.
public string MakeDebugMsg(int stepNum, bool testsOK, string debugMessage, bool textChanged, string pageText)
{
	string retval = "DEBUGGING " + botNV() + ". --- This edit should NOT have been saved. Please revert.\n";

	retval += "\nSTEP: " + stepNum;
	retval += "\nSTATUS: " + (testsOK ? "Success" : "Fail");
	retval += "\nNOTES: " + debugMessage;

	retval += "\n\nArticle text follows below the line. ";
	retval += textChanged
		? "This text has been modified"
		: "This is the original text, unmodified\n";
	retval += "\n\n____________________________________";

	// FIX: the report announces that the article text follows below the line, but
	// pageText was never appended, so the report ended at the separator.
	// (Concatenating a null string is harmless in C#: it is treated as empty.)
	retval += "\n\n" + pageText;

	return retval;
}