User:Reedy/AWB/WikiProject User scripts Formatter

From Wikipedia, the free encyclopedia

Wikipedia:WikiProject User scripts/Scripts/Formatter

/// <summary>
/// Runs the formatter passes over an article's wikitext, in a fixed order, and
/// returns the result.
/// </summary>
/// <param name="ArticleText">Wikitext to process.</param>
/// <param name="ArticleTitle">Not read by this method.</param>
/// <param name="wikiNamespace">Not read by this method.</param>
/// <param name="Summary">Always set to an empty string.</param>
/// <param name="Skip">Always set to <c>false</c>.</param>
/// <returns>The text after all passes have run.</returns>
public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
        {
            Summary = "";
            Skip = false;

            string text = catFixer(ArticleText);
            text = entities(text);
            text = fixheadings(text);
            text = fixsyntax(text);
            text = linkfixer(text, false);
            // The imagefixer pass is not run here (it was commented out).
            text = whitespace(text);

            // trim is applied twice in a row, exactly as before.
            return trim(trim(text));
        }
 
        /// <summary>
        /// Normalises whitespace in wikitext: tabs, blank-line runs, spacing around
        /// headings, list bullets and dashes.
        /// </summary>
        /// <param name="ArticleText">Wikitext to clean up.</param>
        /// <returns>The cleaned text, passed through <c>trim</c>.</returns>
        private string whitespace(string ArticleText)
        {
            // The patterns used to carry JavaScript /pattern/flags delimiters, which
            // .NET treats as literal characters, so none of them ever matched.
            // "g" is implicit in Regex.Replace; "m" is RegexOptions.Multiline.
            const RegexOptions perLine = RegexOptions.Multiline;

            ArticleText = Regex.Replace(ArticleText, @"\t", " ");

            // Blank-looking lines and runs of empty lines.
            ArticleText = Regex.Replace(ArticleText, @"^ ? ? \n", "\n", perLine);
            ArticleText = Regex.Replace(ArticleText, @"(\n\n)\n+", "$1");
            ArticleText = Regex.Replace(ArticleText, @"== ? ?\n\n==", "==\n==");
            ArticleText = Regex.Replace(ArticleText, @"\n\n(\* ?\[?http)", "\n$1");

            ArticleText = Regex.Replace(ArticleText, @"^ ? ? \n", "\n", perLine);
            ArticleText = Regex.Replace(ArticleText, @"\n\n\*", "\n*");
            ArticleText = Regex.Replace(ArticleText, @"[ \t][ \t]+", " ");
            ArticleText = Regex.Replace(ArticleText, @"([=\n]\n)\n+", "$1");
            ArticleText = Regex.Replace(ArticleText, @" \n", "\n");

            // Bullet points: drop one existing space after the markers, then add
            // exactly one back.
            ArticleText = Regex.Replace(ArticleText, @"^([\*#]+) ", "$1", perLine);
            ArticleText = Regex.Replace(ArticleText, @"^([\*#]+)", "$1 ", perLine);

            // ==Headings==: remove one space just inside the equals signs.
            ArticleText = Regex.Replace(ArticleText, @"^(={1,4}) ?(.*?) ?(={1,4})$", "$1$2$3", perLine);

            // Dash spacing. U+2013 is the en dash, U+2014 the em dash.
            // NOTE(review): the previous alternations repeated the literal dash
            // five times; the entity spellings below are the presumed original
            // intent - confirm.
            const string enDash = "\u2013|&ndash;|&#8211;|&#x2013;";
            const string emDash = "\u2014|&mdash;|&#8212;|&#x2014;";

            ArticleText = Regex.Replace(ArticleText, " ?(" + enDash + ") ?", "$1");
            ArticleText = Regex.Replace(ArticleText, " ?(" + emDash + ") ?", "$1");

            // Re-space dashes unless a digit sits on either side (number ranges stay
            // tight). The class used to be [^1-9], which treated 0 as a non-digit.
            ArticleText = Regex.Replace(ArticleText, "([^0-9])(" + emDash + "|" + enDash + ")([^0-9])", "$1 $2 $3");

            return trim(ArticleText);
        }
 
        /// <summary>
        /// Replaces a small set of HTML entities with the characters they stand for.
        /// </summary>
        /// <param name="ArticleText">Wikitext to process.</param>
        /// <returns>The converted text, passed through <c>trim</c>.</returns>
        private string entities(string ArticleText)
        {
            // The patterns used to carry JavaScript /pattern/g delimiters, which .NET
            // treats as literal characters, so nothing was ever replaced.
            //
            // NOTE(review): the dash alternations previously listed the literal dash
            // three times (replacing a dash with itself). The named, decimal and hex
            // entity spellings below are the presumed original intent - confirm.
            ArticleText = Regex.Replace(ArticleText, "&ndash;|&#8211;|&#x2013;", "\u2013"); // en dash
            ArticleText = Regex.Replace(ArticleText, "&mdash;|&#8212;|&#x2014;", "\u2014"); // em dash

            // A rule rewriting "<sup>2</sup>" after cm/m/km/mi was commented out in
            // the original and is still not applied.
            ArticleText = ArticleText.Replace("&sup2;", "\u00B2"); // superscript two
            ArticleText = ArticleText.Replace("&deg;", "\u00B0");  // degree sign

            return trim(ArticleText);
        }
 
        /// <summary>
        /// Renames common variants of standard section headings ("See also",
        /// "External links", "References", "Sources", "Further reading") to the
        /// standard spelling.
        /// </summary>
        /// <param name="ArticleText">Wikitext to process.</param>
        /// <returns>The text with the matching headings renamed.</returns>
        private string fixheadings(string ArticleText)
        {
            // The patterns used to carry JavaScript /pattern/gi delimiters, which
            // .NET treats as literal characters, so no heading was ever renamed.
            // The "i" flag is now RegexOptions.IgnoreCase.
            const RegexOptions ignoreCase = RegexOptions.IgnoreCase;

            // Only create a "See also" heading when the article does not already
            // have one (this check is case-sensitive, as it was before).
            if (!Regex.IsMatch(ArticleText, "= ?See also ?="))
            {
                ArticleText = Regex.Replace(ArticleText, "(== ?)(see also:?|related topics:?|related articles:?|internal links:?|also see:?)( ?==)", "$1See also$3", ignoreCase);
            }

            ArticleText = Regex.Replace(ArticleText, "(== ?)(external links?:?|outside links?|web ?links?:?|exterior links?:?)( ?==)", "$1External links$3", ignoreCase);
            ArticleText = Regex.Replace(ArticleText, "(== ?)(references?:?)( ?==)", "$1References$3", ignoreCase);
            ArticleText = Regex.Replace(ArticleText, "(== ?)(sources?:?)( ?==)", "$1Sources$3", ignoreCase);
            ArticleText = Regex.Replace(ArticleText, "(== ?)(further readings?:?)( ?==)", "$1Further reading$3", ignoreCase);

            return ArticleText;
        }
 
        /// <summary>
        /// Normalises the opening of category links to "[[Category:", removing
        /// optional single spaces around the name and colon and capitalising it.
        /// </summary>
        /// <param name="ArticleText">Wikitext to process.</param>
        /// <returns>The converted text, passed through <c>trim</c>.</returns>
        private string catFixer(string ArticleText)
        {
            // The pattern used to carry JavaScript /pattern/g delimiters, which .NET
            // treats as literal characters, so it never matched.
            ArticleText = Regex.Replace(ArticleText, @"\[\[ ?[Cc]ategory ?: ?", "[[Category:");

            return trim(ArticleText);
        }
 
        /// <summary>
        /// Replaces HTML italic and bold tags with wiki apostrophe markup and
        /// normalises line-break tags to the spaced self-closing form.
        /// </summary>
        /// <param name="ArticleText">Wikitext to process.</param>
        /// <returns>The converted text, passed through <c>trim</c>.</returns>
        private string fixsyntax(string ArticleText)
        {
            // The patterns used to carry JavaScript /pattern/gi delimiters, which
            // .NET treats as literal characters, so nothing was ever converted.
            // The "i" flag is now RegexOptions.IgnoreCase.
            const RegexOptions ignoreCase = RegexOptions.IgnoreCase;

            // Skip the tag conversion entirely when any <i>/<b> tag touches an
            // apostrophe: the generated ''/''' would merge with it.
            if (!Regex.IsMatch(ArticleText, @"'</?[ib]>|</?[ib]>'", ignoreCase))
            {
                ArticleText = Regex.Replace(ArticleText, @"<i>(.*?)</i>", "''$1''", ignoreCase);
                ArticleText = Regex.Replace(ArticleText, @"<b>(.*?)</b>", "'''$1'''", ignoreCase);
            }

            // Covers both "<br/>" and "<br>".
            ArticleText = Regex.Replace(ArticleText, @"<br/?>", "<br />", ignoreCase);

            return trim(ArticleText);
        }
 
        /// <summary>
        /// Formats links in a standard fashion: separates adjacent brackets, tidies
        /// internal links (underscores, spacing around pipes and brackets) and
        /// repairs malformed external http links.
        /// </summary>
        /// <param name="ArticleText">Wikitext to process.</param>
        /// <param name="checkImages">Accepted for compatibility; not read by this method.</param>
        /// <returns>The converted text, passed through <c>trim</c>.</returns>
        private string linkfixer(string ArticleText, bool checkImages)
        {
            // All patterns used to carry JavaScript /pattern/flags delimiters, which
            // .NET treats as literal characters, so none of them ever matched.
            ArticleText = ArticleText.Replace("][", "] [");

            // Rewrite every bracketed link in place. The previous loop called
            // Regex.Match (first match only) and then indexed m.Groups by character
            // position, which yields empty strings and makes Substring throw.
            ArticleText = Regex.Replace(ArticleText, @"\[?\[[^\]]*?\]\]?", new MatchEvaluator(normalizeBracketedLink));

            //repair bad internal links
            ArticleText = Regex.Replace(ArticleText, @"\[\[ ?([^\]]*?) ?\]\]", "[[$1]]");
            ArticleText = Regex.Replace(ArticleText, @"\[\[([^\]]*?)( |_)#([^\]]*?)\]\]", "[[$1#$3]]");

            //repair bad external links
            ArticleText = Regex.Replace(ArticleText, @"\[?\[http://([^\]]*?)\]\]?", "[http://$1]", RegexOptions.IgnoreCase);
            ArticleText = Regex.Replace(ArticleText, @"\[http://([^\]]*?)\|([^\]]*?)\]", "[http://$1 $2]", RegexOptions.IgnoreCase);

            return trim(ArticleText);
        }

        // Tidies one bracketed link; http and image links are returned untouched.
        private static string normalizeBracketedLink(Match link)
        {
            string text = link.Value;

            if (Regex.IsMatch(text, @"^\[?\[http://", RegexOptions.IgnoreCase)
                || Regex.IsMatch(text, @"^\[?\[image:", RegexOptions.IgnoreCase))
            {
                return text;
            }

            // Underscores become spaces only when there is no colon anywhere in the
            // link and no underscore directly after "[[" or "|".
            bool plainTarget = text.IndexOf(':') == -1
                && !text.StartsWith("[[_", System.StringComparison.Ordinal)
                && text.IndexOf("|_", System.StringComparison.Ordinal) == -1;

            if (plainTarget)
            {
                int pipe = text.IndexOf('|');
                if (pipe == -1)
                {
                    text = text.Replace('_', ' ');
                }
                else
                {
                    // Only the target (before the pipe) is changed; the label is kept.
                    text = text.Substring(0, pipe).Replace('_', ' ') + text.Substring(pipe);
                }
            }

            text = Regex.Replace(text, @" ?\| ?", "|");

            // Literal replace: as a regex, "|]]" is an alternation with an empty
            // branch and would insert the replacement at every position.
            return text.Replace("|]]", "| ]]");
        }
 
        /// <summary>
        /// Converts image links that point at external http URLs into plain external
        /// links, then runs <c>linkfixer</c> over the contents of each remaining
        /// image link.
        /// </summary>
        /// <param name="ArticleText">Wikitext to process.</param>
        /// <returns>The converted text, passed through <c>trim</c>.</returns>
        private string imagefixer(string ArticleText)
        {
            // All patterns used to carry JavaScript /pattern/flags delimiters, which
            // .NET treats as literal characters, so none of them ever matched.

            //remove external images
            ArticleText = Regex.Replace(ArticleText, @"\[?\[image:http://([^\]]*?)\]\]?", "[http://$1]", RegexOptions.IgnoreCase);

            //fix links within internal images
            // Every match is rewritten in place. The previous loop called
            // Regex.Match (first match only) and indexed m.Groups by character
            // position, which does not enumerate matches.
            // NOTE(review): this pattern nests lazy quantifiers and may backtrack
            // heavily on unbalanced brackets - confirm on large pages.
            ArticleText = Regex.Replace(
                ArticleText,
                @"\[?\[image:[^\[\]]*?(\[?\[[^\]]*?\]*?[^\[\]]*?)*?\]+",
                new MatchEvaluator(rebuildImageLink),
                RegexOptions.IgnoreCase);

            return trim(ArticleText);
        }

        // Unwraps one image link, fixes the links inside it, and wraps it again.
        private string rebuildImageLink(Match image)
        {
            string inner = image.Value;

            // Drop the opening "[[" and force a capital "I".
            inner = Regex.Replace(inner, @"^\[\[i", "I", RegexOptions.IgnoreCase);

            // \z is the absolute end of input; a bare $ in .NET would also match
            // before a final newline.
            inner = Regex.Replace(inner, @"\]\]\z", "");

            // Re-close a bracket that is still open at the end of the text.
            inner = Regex.Replace(inner, @"(\[[^\]]*?)\z", "$1]");

            inner = linkfixer(inner, true);
            return "[[" + inner + "]]";
        }
 
        /// <summary>
        /// Removes one trailing space from the end of each line, then strips all
        /// whitespace from the start and end of the text.
        /// </summary>
        /// <param name="ArticleText">Text to trim.</param>
        /// <returns>The trimmed text.</returns>
        private string trim(string ArticleText)
        {
            // The patterns used to carry JavaScript /pattern/flags delimiters, which
            // .NET treats as literal characters, so this method returned its input
            // unchanged. "m" is now RegexOptions.Multiline; "g" is implicit.
            ArticleText = Regex.Replace(ArticleText, " $", "", RegexOptions.Multiline);

            // \A and \z anchor to the absolute start and end of the input.
            return Regex.Replace(ArticleText, @"\A\s+|\s+\z", "");
        }
 
        /// <summary>
        /// Protects entities that should never be converted to Unicode characters
        /// by escaping their leading ampersand.
        /// </summary>
        /// <param name="ArticleText">Wikitext to process.</param>
        /// <returns>The text with the listed entities escaped.</returns>
        private string noUnicodify(string ArticleText)
        {
            ArticleText = ArticleText.Replace(" &amp; ", " & ");
            ArticleText = ArticleText.Replace("&amp;", "&amp;amp;");
            ArticleText = ArticleText.Replace("&amp;lt;", "&amp;amp;lt;");
            ArticleText = ArticleText.Replace("&amp;gt;", "&amp;amp;gt;");
            ArticleText = ArticleText.Replace("&amp;quot;", "&amp;amp;quot;");
            ArticleText = ArticleText.Replace("&amp;apos;", "&amp;amp;apos;");
            ArticleText = ArticleText.Replace("&minus;", "&amp;minus;");
            ArticleText = ArticleText.Replace("&times;", "&amp;times;");

            ArticleText = ArticleText.Replace("&nbsp;", "&amp;nbsp;");
            ArticleText = ArticleText.Replace("&thinsp;", "&amp;thinsp;");
            ArticleText = ArticleText.Replace("&shy;", "&amp;shy;");
            ArticleText = ArticleText.Replace("&prime;", "&amp;prime;");

            // Numeric entities 91/93 and 123-125, with an optional leading zero.
            // These two rules are regular expressions with a $1 back-reference;
            // string.Replace is literal, so they previously never matched.
            ArticleText = Regex.Replace(ArticleText, @"&(#0?9[13];)", "&amp;$1");
            ArticleText = Regex.Replace(ArticleText, @"&(#0?12[345];)", "&amp;$1");

            return ArticleText;
        }