Jump to content

User:Tom.Bot/Task6 code

From Wikipedia, the free encyclopedia

Source

[edit]
/// <summary>
/// Adds {{Authority control}} to a biography page when the page's Wikidata item
/// carries at least one of the identifier properties listed in ACPropertyList.
/// </summary>
/// <remarks>
/// The page is skipped (Skip = true, reason in Summary) when it is blacklisted, has no
/// person/scientist infobox, already carries {{Authority control}} or one of its aliases,
/// its Wikidata item ID cannot be retrieved, the item uses none of the listed properties,
/// or the page has no {{DEFAULTSORT}}/category line to anchor the template to.
/// Presumably invoked by AutoWikiBrowser as a custom module - confirm against the host tool.
/// </remarks>
/// <param name="ArticleText">Wikitext of the page.</param>
/// <param name="ArticleTitle">Page title; used for the blacklist, sandbox detection and the API lookup.</param>
/// <param name="wikiNamespace">Not used by this method.</param>
/// <param name="Summary">Edit summary on success; otherwise the reason for skipping.</param>
/// <param name="Skip">True when the page should not be saved. Forced to false when LiveDebug or SandboxDebug is on.</param>
/// <returns>The (possibly modified) wikitext.</returns>
public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
{
	// global switches //////////////////////////////////////////////////////////
	
	bool TomBot = true;
	bool SaveSkipSummaries = false;
	bool SkipIfBlacklisted = true;
	bool ManuallyCheckPagesWithoutAGoodInfobox = false; // usually it's an {{infobox person}} or {{infobox scientist}}
	bool ManuallyPlaceAuthorityAtEndOfPage = false; // aid for pages w/o a {{DEFAULTSORT}} nor cats; manual use only
	bool LiveDebug = false;
	bool SandboxDebug = false; // auto-detected from the page title below
	Skip = false;
	
	
	// global-use vars //////////////////////////////////////////////////////////
	
	Summary = "";
	
	
	// preliminary exceptions/error checking ////////////////////////////////////
	
	// In sandbox mode no network requests are made and the page is never skipped.
	if (ArticleTitle == "User:Tom.Reding/sandbox") SandboxDebug = true;
	
	List<string> BlackList = new List<string>(new string[] {
		""
	});
	if (!Skip && SkipIfBlacklisted && BlackList.Contains(ArticleTitle))
	{
		Summary = "Blacklisted article";
		Skip = true;
	}
	
	// check for appropriate (bio) infoboxes (now done via PetScan for all templates in [[Category:People and person infobox templates]], per BRFA)
	
	string PeopleTemplates_Regex = @"\{\{\s*(?:[Ii]nfobox[ _]+actor[ _]+voice|[Ii]nfobox[ _]+Actor|[Ii]nfobox[ _]+actor|[Ii]nfobox[ _]+Actress|[Ii]nfobox[ _]+actress|[Ii]nfobox[ _]+adult[ _]+biography|[Ii]nfobox[ _]+adult[ _]+female|[Ii]nfobox[ _]+adult[ _]+male|[Ii]nfobox[ _]+Biography|[Ii]nfobox[ _]+biography|[Ii]nfobox[ _]+bio|[Ii]nfobox[ _]+Celebrity|[Ii]nfobox[ _]+director|[Ii]nfobox[ _]+entertainer|[Ii]nfobox[ _]+Fashion[ _]+Designer|[Ii]nfobox[ _]+fashion[ _]+designer|[Ii]nfobox[ _]+film[ _]+actor|[Ii]nfobox[ _]+film[ _]+director|[Ii]nfobox[ _]+human[ _]+being|[Ii]nfobox[ _]+human|[Ii]nfobox[ _]+Indian[ _]+Businessmen|[Ii]nfobox[ _]+Journalist|[Ii]nfobox[ _]+journalist|[Ii]nfobox[ _]+people|[Ii]nfobox[ _]+performer|[Ii]nfobox[ _]+person/measurements|[Ii]nfobox[ _]+person[ _]+ii|[Ii]nfobox[ _]+person|[Ii]nfobox[ _]+Person|[Ii]nfobox[ _]+photographer|[Ii]nfobox[ _]+Real[ _]+Person|[Ii]nfobox[ _]+trade[ _]+unionist|[Ii]nfobox[ _]+victim|[Pp]ersonbox)(?=\s*(?:\||\<\!\-\-))";
	
	string ScientistTemplates_Regex = @"\{\{\s*(?:[Ii]nfobox[ _]+Academic|[Ii]nfobox[ _]+chemist|[Ii]nfobox[ _]+historian|[Ii]nfobox[ _]+mathematician|[Ii]nfobox[ _]+Professor|[Ii]nfobox[ _]+scientist|[Ii]nfobox[ _]+Scientist)(?=\s*(?:\||\<\!\-\-))";
	
	bool Bio1 = Regex.IsMatch(ArticleText, PeopleTemplates_Regex, RegexOptions.IgnoreCase);
	bool Bio2 = Regex.IsMatch(ArticleText, ScientistTemplates_Regex, RegexOptions.IgnoreCase);
	bool NoBioTemplates = !Bio1 && !Bio2;
	// With ManuallyCheckPagesWithoutAGoodInfobox on, pages lacking a bio infobox are let through for manual review.
	if (!Skip && NoBioTemplates && !ManuallyCheckPagesWithoutAGoodInfobox)
	{
		Summary += @"No bio templates found. ";
		Skip = true;
	}
	
	// check for {{Authority control
	if (!Skip)
	{
		string AuthorityAliases_Regex = @"\{\{\s*(?:[Aa]uthoritycontrol|[Aa]uthority[ _]+controll|[Aa]uthority[ _]+control|[Aa]uthority[ _]+Control|[Aa]utorité|[Ee]xternal[ _]+identifiers|[Nn]ormdaten)"; // 0 grps
		bool HasAuthority = Regex.IsMatch(ArticleText, AuthorityAliases_Regex, RegexOptions.IgnoreCase);
		if (HasAuthority)
		{
			Summary += @"{{Authority control}} exists. ";
			Skip = true;
		}
	}
	
	// get wikibase_item via WP API
	// ex: https://en.wikipedia.org//w/api.php?action=query&format=json&prop=pageprops&titles=Panthera%20leo&redirects=0&formatversion=2&ppprop=wikibase_item
	// Uri.EscapeDataString emits UTF-8 percent-escapes, so titles containing non-ASCII
	// characters (e.g. an en dash, which must be %E2%80%93 rather than %96) or reserved
	// characters such as '%' and '=' are transmitted correctly.
	// Fully qualified so that no additional using directive is required.
	string ArticleTitle_URL = System.Uri.EscapeDataString(ArticleTitle);
	// NOTE(review): MediaWiki API boolean parameters count as true whenever they are present,
	// so "redirects=0" may still resolve redirects - confirm this is the intended behaviour.
	string URL1 = @"https://en.wikipedia.org//w/api.php?action=query&format=json&prop=pageprops&titles=" + 
						ArticleTitle_URL + @"&redirects=0&formatversion=2&ppprop=wikibase_item";
	string HTML1 = "";
	if (!Skip && !SandboxDebug)
	{
		try
		{
			HTML1 = Tools.GetHTML(URL1);
		}
		catch
		{
			// Any failure of the request is reported through Summary; the page is skipped unless LiveDebug is on.
			Summary = "GetHTML1 failed. ArticleTitle_URL = " + ArticleTitle_URL + " . ";
			if (!LiveDebug) Skip = true;
		}
	}
	
	
	// html1 error checks ///////////////////////////////////////////////////////
	
	// Group 1 is the value of "wikibase_item" in the JSON response; empty when the key is absent.
	string QID = Regex.Match(HTML1, @"wikibase_item"":""([^""]+)").Groups[1].Value;
	if (string.IsNullOrEmpty(QID) && !Skip && !SandboxDebug)
	{
		Summary = @"QID retrieval failed. ";
		Skip = true;
	}
	
	if (!Regex.IsMatch(QID, @"^Q\d+$") && !Skip && !SandboxDebug) // case sensitive, jtbs
	{
		Summary = @"Unexpected QID format. ";
		Skip = true;
	}
	
	
	// determine # of WD properties used ////////////////////////////////////////
	
	List<string> ACPropertyList = new List<string>(new string[] {
		// from Module:Authority control's local conf = { ... } table:
		"P864",
		"P2558",
		"P3293",
		"P1015",
		"P2092",
		"P950",
		"P268",
		"P428",
		"P651",
		"P271",
		"P2456",
		"P227",
		"P902",
		"P213",
		"P347",
		"P1248",
		"P244",
		"P886",
		"P640",
		"P434",
		"P549",
		"P1225",
		"P1223",
		"P1222",
		"P1048",
		"P349",
		"P691",
		"P409",
		"P496",
		"P2750",
		"P1053",
		"P650",
		"P350",
		"P947",
		"P396",
		"P906",
		"P781",
		"P3430",
		"P269",
		"P1362",
		"P245",
		"P1157",
		"P214"
	});
	
	// get Wikidata
	// ex: https://www.wikidata.org//w/api.php?action=wbgetclaims&format=json&entity=Q184201
	// QID has been validated against ^Q\d+$ above whenever the request below is actually made.
	string URL2 = @"https://www.wikidata.org//w/api.php?action=wbgetclaims&format=json&entity=" + QID;
	string HTML2 = "";
	if (!Skip && !SandboxDebug)
	{
		try
		{
			HTML2 = Tools.GetHTML(URL2);
		}
		catch
		{
			// Same policy as the first request: report through Summary, skip unless LiveDebug is on.
			Summary = "GetHTML2 failed. URL2 = " + URL2 + " . ";
			if (!LiveDebug) Skip = true;
		}
	}
	
	// scrape Wikidata
	// example text surrounding a populated property from
	// https://www.wikidata.org/w/api.php?action=wbgetclaims&entity=Q184201 :
	//        "P227": [
	//            {
	//                "mainsnak": {
	//                    "snaktype": "value",
	//                    "property": "P227",
	//                    "hash": "275a0595679f80411271280f2ee7344a94dfbeb6",
	//                    "datavalue": {
	//                        "value": "4776869-1",
	//                        "type": "string"
	//                    },
	//                    "datatype": "external-id"
	//                },
	int iProps = 0; // number of listed properties that have a non-empty string value on the item
	if (!Skip && !SandboxDebug)
	{
		foreach (string p in ACPropertyList)
		{
			// [^\{\}]* keeps the match inside a single snak object, between "property" and its "datavalue".
			string p_regex = @"""property"":\s*""" + p + @""",[^\{\}]*""datavalue"":\s*\{\s*""value"":\s*""[^""]+""";
			bool Found = Regex.IsMatch(HTML2, p_regex);
			if (Found) iProps++;
		}
		
		if (iProps == 0)
		{
			Summary = @"0 IDs on Wikidata. ";
			Skip = true;
		}
	}
	
	
	// main /////////////////////////////////////////////////////////////////////
	
	if (!Skip)
	{
		if (SandboxDebug)
		{
			// No lookups were made in sandbox mode; use placeholder values.
			iProps = 1;
			QID = "1";
		}
		
		// std {{DEFAULTSORT
		string DF_Regex = @"\{\{\s*(?:DEFAULTSORT|[Dd]efaultSort|[Dd]efaultsort|DEFAULT[ _]+SORT|[Dd]efault[ _]+sort|[Ss]ORTIERUNG:Lasorling|SORTIERUNG)(?=[:\|\}])";
		ArticleText = Regex.Replace(ArticleText, DF_Regex, @"{{DEFAULTSORT", RegexOptions.IgnoreCase);
		
		// Move {{-stub}} tag closer to end of page, otherwise GenFixes adds an extra line before {{Authority control}} that can't be fixed w/o a reparse.
		// Leading "\s*" replaced with "\n" fix cases like "{{reflist}}{{blah-stub}}" on the same line.
		string MoveStubAfterCat_Regex = @"\s*(\{\{[^\{\}]*[ -]stub\s*\}\})\s*(\[\[\s*Category[^\[\]]+\]\])";
		ArticleText = Regex.Replace(ArticleText, MoveStubAfterCat_Regex, "\n" + @"$2" + "\n" + @"$1", RegexOptions.IgnoreCase);
		
		string AuthorityComplete = @"{{Authority control}}";
		// Captures everything from the start of the page up to the line break(s) preceding the first
		// {{DEFAULTSORT or [[Category line. '^' is not multiline, so at most one match exists.
		string AddBeforeCats_Regex = @"(^[\d\D]+?)(?=[\r\n]+[ \t]*(?:\{\{DEFAULTSORT|\[\[\s*Category))"; // better results than adding after last cat
		
		string Plural = (iProps > 1) ? "s" : "";
		string SuccessSummary = @"+{{[[Template:Authority control|Authority control]]}}";
		if (TomBot) SuccessSummary = @"[[Wikipedia:Bots/Requests for approval/Tom.Bot 6|Task 6]]: " + SuccessSummary;
		if (iProps > 0) SuccessSummary += " (" + iProps + @" source" + Plural + @" from Wikidata)";
		SuccessSummary += ", [[WP:GenFixes]] on,";
		
		bool NoCat = !Regex.IsMatch(ArticleText, AddBeforeCats_Regex, RegexOptions.IgnoreCase);
		if (NoCat)
		{
			if (ManuallyPlaceAuthorityAtEndOfPage)
			{
				ArticleText += "\n" + AuthorityComplete;
				Summary = SuccessSummary + " (uncategorized page) ";
			}
			else
			{
				Summary += @"No cats/DEFAULTSORT to anchor {{Authority control}} to. Batch manually/code later. ";
				Skip = true;
			}
		}
		else
		{
			// Re-emit the captured text followed by the template on its own line, just above the anchor.
			ArticleText = Regex.Replace(ArticleText, AddBeforeCats_Regex, @"$1" + "\n" + AuthorityComplete, RegexOptions.IgnoreCase);
			Summary = SuccessSummary;
		}
	}
	
	
	// exception tracking ///////////////////////////////////////////////////////
	
	if (Skip && SaveSkipSummaries && !SandboxDebug)
	{
		// One tab-separated "title<TAB>reason" line per skipped page, appended to a local log file.
		string Message = ArticleTitle + "\t" + Summary + "\n";
		string File = @"Module output - Add {{Authority control}} (skip summaries).txt";
		string Path = @"F:\"; // desktop
		string FullPath = Path + File;
		const bool APPEND = true;
		Tools.WriteTextFileAbsolutePath(Message, FullPath, APPEND);
	}
	
	// Debug modes never skip, so the result can always be inspected.
	if (LiveDebug || SandboxDebug) Skip = false;
	
	return ArticleText;
}