User:Tom.Bot/Task6 code
Appearance
Source
/// <summary>
/// Tom.Bot task 6: appends <c>{{Authority control}}</c> to a biography page when the page's
/// Wikidata item carries at least one identifier property from <c>ACPropertyList</c>.
/// </summary>
/// <remarks>
/// A page is skipped (<paramref name="Skip"/> = true, reason in <paramref name="Summary"/>) when:
/// it is blacklisted; it has no person/scientist infobox; it already has an authority-control
/// template; its Wikidata QID cannot be retrieved or looks malformed; Wikidata lists none of the
/// tracked identifier properties; or there is no <c>{{DEFAULTSORT}}</c>/category line to anchor the
/// template to. The page titled "User:Tom.Reding/sandbox" switches on a sandbox mode that bypasses
/// every network lookup and never reports a skip.
/// </remarks>
/// <param name="ArticleText">Current wikitext of the page.</param>
/// <param name="ArticleTitle">Full page title; used for the blacklist, sandbox detection and the API query.</param>
/// <param name="wikiNamespace">Not used by this method.</param>
/// <param name="Summary">Receives the edit summary on success, or the skip reason otherwise.</param>
/// <param name="Skip">Receives true when the page should not be saved.</param>
/// <returns>The (possibly modified) wikitext.</returns>
public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
{
    // global switches //////////////////////////////////////////////////////////
    bool TomBot = true;
    bool SaveSkipSummaries = false;
    bool SkipIfBlacklisted = true;
    bool ManuallyCheckPagesWithoutAGoodInfobox = false; // usually it's an {{infobox person}} or {{infobox scientist}}
    bool ManuallyPlaceAuthorityAtEndOfPage = false;     // aid for pages w/o a {{DEFAULTSORT}} nor cats; manual use only
    bool LiveDebug = false;
    bool SandboxDebug = false;                          // auto-detected from the page title below

    // out parameters ///////////////////////////////////////////////////////////
    Skip = false;
    Summary = "";

    // preliminary exceptions/error checking ////////////////////////////////////
    if (ArticleTitle == "User:Tom.Reding/sandbox")
    {
        SandboxDebug = true;
    }

    List<string> BlackList = new List<string>(new string[] {
        ""
    });
    if (SkipIfBlacklisted && BlackList.Contains(ArticleTitle))
    {
        Summary = "Blacklisted article";
        Skip = true;
    }

    // check for appropriate (bio) infoboxes (now done via PetScan for all templates in
    // [[Category:People and person infobox templates]], per BRFA)
    if (!Skip)
    {
        string PeopleTemplates_Regex = @"\{\{\s*(?:[Ii]nfobox[ _]+actor[ _]+voice|[Ii]nfobox[ _]+Actor|[Ii]nfobox[ _]+actor|[Ii]nfobox[ _]+Actress|[Ii]nfobox[ _]+actress|[Ii]nfobox[ _]+adult[ _]+biography|[Ii]nfobox[ _]+adult[ _]+female|[Ii]nfobox[ _]+adult[ _]+male|[Ii]nfobox[ _]+Biography|[Ii]nfobox[ _]+biography|[Ii]nfobox[ _]+bio|[Ii]nfobox[ _]+Celebrity|[Ii]nfobox[ _]+director|[Ii]nfobox[ _]+entertainer|[Ii]nfobox[ _]+Fashion[ _]+Designer|[Ii]nfobox[ _]+fashion[ _]+designer|[Ii]nfobox[ _]+film[ _]+actor|[Ii]nfobox[ _]+film[ _]+director|[Ii]nfobox[ _]+human[ _]+being|[Ii]nfobox[ _]+human|[Ii]nfobox[ _]+Indian[ _]+Businessmen|[Ii]nfobox[ _]+Journalist|[Ii]nfobox[ _]+journalist|[Ii]nfobox[ _]+people|[Ii]nfobox[ _]+performer|[Ii]nfobox[ _]+person/measurements|[Ii]nfobox[ _]+person[ _]+ii|[Ii]nfobox[ _]+person|[Ii]nfobox[ _]+Person|[Ii]nfobox[ _]+photographer|[Ii]nfobox[ _]+Real[ _]+Person|[Ii]nfobox[ _]+trade[ _]+unionist|[Ii]nfobox[ _]+victim|[Pp]ersonbox)(?=\s*(?:\||\<\!\-\-))";
        string ScientistTemplates_Regex = @"\{\{\s*(?:[Ii]nfobox[ _]+Academic|[Ii]nfobox[ _]+chemist|[Ii]nfobox[ _]+historian|[Ii]nfobox[ _]+mathematician|[Ii]nfobox[ _]+Professor|[Ii]nfobox[ _]+scientist|[Ii]nfobox[ _]+Scientist)(?=\s*(?:\||\<\!\-\-))";
        bool HasBioTemplate =
            Regex.IsMatch(ArticleText, PeopleTemplates_Regex, RegexOptions.IgnoreCase) ||
            Regex.IsMatch(ArticleText, ScientistTemplates_Regex, RegexOptions.IgnoreCase);

        // With the manual switch on, pages without a bio infobox are allowed through for hand review.
        if (!HasBioTemplate && !ManuallyCheckPagesWithoutAGoodInfobox)
        {
            Summary += @"No bio templates found. ";
            Skip = true;
        }
    }

    // check for {{Authority control
    if (!Skip)
    {
        string AuthorityAliases_Regex = @"\{\{\s*(?:[Aa]uthoritycontrol|[Aa]uthority[ _]+controll|[Aa]uthority[ _]+control|[Aa]uthority[ _]+Control|[Aa]utorité|[Ee]xternal[ _]+identifiers|[Nn]ormdaten)"; // 0 grps
        if (Regex.IsMatch(ArticleText, AuthorityAliases_Regex, RegexOptions.IgnoreCase))
        {
            Summary += @"{{Authority control}} exists. ";
            Skip = true;
        }
    }

    // get wikibase_item via WP API
    // ex: https://en.wikipedia.org//w/api.php?action=query&format=json&prop=pageprops&titles=Panthera%20leo&redirects=0&formatversion=2&ppprop=wikibase_item
    // Uri.EscapeDataString percent-encodes the title as UTF-8. The previous hand-written Replace
    // chain sent the en dash as the single byte %96 (not valid UTF-8) and left '%', '#', '=' and
    // all other non-ASCII characters raw, so such titles produced a broken query.
    string ArticleTitle_URL = System.Uri.EscapeDataString(ArticleTitle);
    string URL1 = @"https://en.wikipedia.org//w/api.php?action=query&format=json&prop=pageprops&titles=" +
                  ArticleTitle_URL + @"&redirects=0&formatversion=2&ppprop=wikibase_item";
    string HTML1 = "";
    if (!Skip && !SandboxDebug)
    {
        try
        {
            HTML1 = Tools.GetHTML(URL1);
        }
        catch
        {
            // Deliberate catch-all: any fetch failure becomes a skip reason rather than aborting the run.
            Summary = "GetHTML1 failed. ArticleTitle_URL = " + ArticleTitle_URL + " . ";
            if (!LiveDebug)
            {
                Skip = true;
            }
        }
    }

    // html1 error checks ///////////////////////////////////////////////////////
    string QID = Regex.Match(HTML1, @"wikibase_item"":""([^""]+)").Groups[1].Value;
    if (!Skip && !SandboxDebug && string.IsNullOrEmpty(QID))
    {
        Summary = @"QID retrieval failed. ";
        Skip = true;
    }
    if (!Skip && !SandboxDebug && !Regex.IsMatch(QID, @"^Q\d+$")) // case sensitive, jtbs
    {
        Summary = @"Unexpected QID format. ";
        Skip = true;
    }

    // determine # of WD properties used ////////////////////////////////////////
    List<string> ACPropertyList = new List<string>(new string[] {
        // from Module:Authority control's local conf = { ... } table:
        "P864",
        "P2558",
        "P3293",
        "P1015",
        "P2092",
        "P950",
        "P268",
        "P428",
        "P651",
        "P271",
        "P2456",
        "P227",
        "P902",
        "P213",
        "P347",
        "P1248",
        "P244",
        "P886",
        "P640",
        "P434",
        "P549",
        "P1225",
        "P1223",
        "P1222",
        "P1048",
        "P349",
        "P691",
        "P409",
        "P496",
        "P2750",
        "P1053",
        "P650",
        "P350",
        "P947",
        "P396",
        "P906",
        "P781",
        "P3430",
        "P269",
        "P1362",
        "P245",
        "P1157",
        "P214"
    });

    // get Wikidata
    // ex: https://www.wikidata.org//w/api.php?action=wbgetclaims&format=json&entity=Q184201
    string URL2 = @"https://www.wikidata.org//w/api.php?action=wbgetclaims&format=json&entity=" + QID;
    string HTML2 = "";
    if (!Skip && !SandboxDebug)
    {
        try
        {
            HTML2 = Tools.GetHTML(URL2);
        }
        catch
        {
            // Deliberate catch-all, same policy as the first lookup.
            Summary = "GetHTML2 failed. URL2 = " + URL2 + " . ";
            if (!LiveDebug)
            {
                Skip = true;
            }
        }
    }

    // scrape Wikidata
    // example text surrounding a populated property from
    // https://www.wikidata.org/w/api.php?action=wbgetclaims&entity=Q184201 :
    //    "P227": [
    //        {
    //            "mainsnak": {
    //                "snaktype": "value",
    //                "property": "P227",
    //                "hash": "275a0595679f80411271280f2ee7344a94dfbeb6",
    //                "datavalue": {
    //                    "value": "4776869-1",
    //                    "type": "string"
    //                },
    //                "datatype": "external-id"
    //            },
    int iProps = 0;
    if (!Skip && !SandboxDebug)
    {
        foreach (string p in ACPropertyList)
        {
            // Counts a property only when a non-empty string "value" follows it inside the same object.
            string p_regex = @"""property"":\s*""" + p + @""",[^\{\}]*""datavalue"":\s*\{\s*""value"":\s*""[^""]+""";
            if (Regex.IsMatch(HTML2, p_regex))
            {
                iProps++;
            }
        }
        if (iProps == 0)
        {
            Summary = @"0 IDs on Wikidata. ";
            Skip = true;
        }
    }

    // main /////////////////////////////////////////////////////////////////////
    if (!Skip)
    {
        if (SandboxDebug)
        {
            // No lookup happened in sandbox mode; pretend one source exists so the summary is exercised.
            iProps = 1;
        }

        // std {{DEFAULTSORT
        string DF_Regex = @"\{\{\s*(?:DEFAULTSORT|[Dd]efaultSort|[Dd]efaultsort|DEFAULT[ _]+SORT|[Dd]efault[ _]+sort|[Ss]ORTIERUNG:Lasorling|SORTIERUNG)(?=[:\|\}])";
        ArticleText = Regex.Replace(ArticleText, DF_Regex, @"{{DEFAULTSORT", RegexOptions.IgnoreCase);

        // Move {{-stub}} tag closer to end of page, otherwise GenFixes adds an extra line before {{Authority control}} that can't be fixed w/o a reparse.
        // Leading "\s*" replaced with "\n" fixes cases like "{{reflist}}{{blah-stub}}" on the same line.
        string MoveStubAfterCat_Regex = @"\s*(\{\{[^\{\}]*[ -]stub\s*\}\})\s*(\[\[\s*Category[^\[\]]+\]\])";
        ArticleText = Regex.Replace(ArticleText, MoveStubAfterCat_Regex, "\n" + @"$2" + "\n" + @"$1", RegexOptions.IgnoreCase);

        string AuthorityComplete = @"{{Authority control}}";
        string AddBeforeCats_Regex = @"(^[\d\D]+?)(?=[\r\n]+[ ]*(?:\{\{DEFAULTSORT|\[\[\s*Category))"; // better results than adding after last cat

        string Plural = (iProps > 1) ? "s" : "";
        string SuccessSummary = @"+{{[[Template:Authority control|Authority control]]}}";
        if (TomBot)
        {
            SuccessSummary = @"[[Wikipedia:Bots/Requests for approval/Tom.Bot 6|Task 6]]: " + SuccessSummary;
        }
        if (iProps > 0)
        {
            SuccessSummary += " (" + iProps + @" source" + Plural + @" from Wikidata)";
        }
        SuccessSummary += ", [[WP:GenFixes]] on,";

        // The pattern is anchored at the start of the text and ends just before the first
        // DEFAULTSORT/category line, so one Match yields both the "is there an anchor?" answer
        // and the insertion point (previously the same regex ran twice: IsMatch, then Replace).
        Match Anchor = Regex.Match(ArticleText, AddBeforeCats_Regex, RegexOptions.IgnoreCase);
        if (Anchor.Success)
        {
            ArticleText = ArticleText.Insert(Anchor.Index + Anchor.Length, "\n" + AuthorityComplete);
            Summary = SuccessSummary;
        }
        else if (ManuallyPlaceAuthorityAtEndOfPage)
        {
            ArticleText += "\n" + AuthorityComplete;
            Summary = SuccessSummary + " (uncategorized page) ";
        }
        else
        {
            Summary += @"No cats/DEFAULTSORT to anchor {{Authority control}} to. Batch manually/code later. ";
            Skip = true;
        }
    }

    // exception tracking ///////////////////////////////////////////////////////
    if (Skip && SaveSkipSummaries && !SandboxDebug)
    {
        string Message = ArticleTitle + "\t" + Summary + "\n";
        string LogFileName = @"Module output - Add {{Authority control}} (skip summaries).txt";
        string LogDirectory = @"F:\"; // desktop
        string FullPath = LogDirectory + LogFileName;
        const bool APPEND = true;
        Tools.WriteTextFileAbsolutePath(Message, FullPath, APPEND);
    }

    // Debug modes never report a skip, so the resulting text can always be inspected.
    if (LiveDebug || SandboxDebug)
    {
        Skip = false;
    }
    return ArticleText;
}