User:Interiot/Tool/code

#!/usr/bin/perl 
# License: [[Public domain]]

# regression test cases:
#       Markadet        fr.wikipedia.org        (11k edits)
#       Kolja21         de.wikipedia.org        (5.1k edits)
#       OldakQuill      en.wikipedia.org        (12k edits)
#       Mxn             meta.wikimedia.org      (1.7k edits)
#       Helios89        it.wikipedia.org        (7k edits)

# TODO:
#   - regarding the "403 access denied" problem, contact noc@wikimedia.org or #wikimedia-tech on freenode
#           - ahh, they actively block screen-scrapers
#           - sweet-talk Hashar or Dom into unblocking, temporarily disable the tool, or enable some form of rate limiting (rough sketch just below this list), etc.
#   - add a starting-cutoff-date, so renominations for RfA could include only the most recent items
#           (see parse_cutoff_sketch() near the bottom of this file)
#           - add a # edits per day (see edits_per_day_sketch() near the bottom of this file)
#   - use something like this to retrieve the list of namespaces in real-time
#     (see fetch_namespaces_sketch() just after %valid_namespaces below):
#           http://en.wikiquote.org/wiki/Special:Export/Main_Page
#   - make "minor" actually work well for editcountitis:
#           - eg. for each namespace, present it like:          Category Talk: 23 (13)
#             where "23" is the non-minor edits, and "13" is the minor edits
#   - get it to work with other mediawikis (example: http://brandt-watch.org/bwwiki/Main_Page)
#   - include a date at the end of the top-15 breakdown
#   - change the <div>s to <td>s on graph bars
#   - don't count comments as having an edit summary when it's purely an autocomment
#           (see has_real_summary_sketch() near the bottom of this file)
#   - fix the issue where there's an "extra" first result when $offset > 0 
#
#   - REWRITE IN AJAX so we don't have to worry about it being a temporary solution or not
#   - fix the sorting order on the output
#
#   - ??        http://tools.wikimedia.de/~avar/cgi-bin/count
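#
#   (hedged sketch for the 403 item above; nothing here is wired in.  One
#    conservative option is to pause between successive Special:Contributions
#    fetches, eg. a
#        sleep(2) if @urls;
#    just before the request in get_5000(), together with a User-Agent string
#    that names the tool and a contact address, so the operators know who to
#    throttle or unblock.)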


# Possible other analysis graphs:
#   - monthly breakdowns
#           : have all the monthly breakdowns appear in one space on the page, but allow the user to
#             select between them with Javascript
#       - monthly breakdown of major/minor edits (like current red/green...  make major edits on left, with minor edits trailing on right)
#       - monthly breakdown of the number of edits with summaries of /^(rv|revert)/ (rough sketch just after this list)
#       - monthly breakdown, one each for the separate namespaces
#       - on monthly breakdowns, extrapolate the current month forward
#   - allow the user to hit ''(more)'' at the bottom of the namespace breakdowns to see
#           more than just the top 15
#   - allow the user to restrict the metrics to some specific recent period...  eg. this is
#           something that's sometimes discussed on RfA
#   - any content-based analyses?  (I suppose one would have to know which SQL thingies are quicker than others)
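#
#   (hedged sketch for the /^(rv|revert)/ breakdown above; all of this is
#    hypothetical and nothing below exists yet.  A %month_revert_totals hash
#    could be bumped in get_5000() next to %month_editsummary_totals, eg.
#        $month_revert_totals{$monthkey}++ if /\(\s*(?:rv\b|revert)/i;
#    and the month table could then draw it as a third colored <div> bar.)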

# semi-far-out:
#   - allow the user to see JUST their edits from a specific page, when they click on that page on
#       the top-15 breakdown (furthermore, if structured right, it might basically let anybody's
#       tool pop up the results of a $user && $page query)
#   - allow the results to be the combination of multiple users (either a logged-in user + an anon IP,
#       or multiple logged-in users from multiple sites, eg. meta) (sketch just below)
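#
#   (sketch of the multi-user idea above: because the counters, ie.
#    %namespace_totals, %month_totals, %unique_articles, etc., are additive
#    globals, combining accounts is mostly a matter of calling get_5000() once
#    per site/username pair and letting the totals accumulate; the per-article
#    keys would just need to be qualified with the site to avoid cross-wiki
#    collisions.)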

use strict;
use warnings;


use CGI;
#use CGI::Carp qw(fatalsToBrowser);
use Date::Parse;
use LWP::Simple;
use HTML::Entities;
use Data::Dumper;

sub LOGFILE {"/home/interiot/public_html/tmp/wannabe_kate.log"}

if (($ENV{QUERY_STRING} || '') eq "code") {     # send our own source code when requested
    open FIN, $0 and print "Content-type: text/plain\n\n", <FIN>;
    exit;
}

# fill out using these documents:
#       http://meta.wikimedia.org/wiki/MediaWiki_localisation#Getting_latest_file
#       http://sourceforge.net/docs/E04/
# Parse an indented block of namespace prefixes: strip "#" comments and
# surrounding whitespace, then return a hashref that doubles as a lookup set,
# with the original ordering preserved under the "\x00order" key.
sub nmspc {
    my @a = map {s/#.*//; s/^\s+|\s+$//g; $_} grep /\S/, split /[\n\r]+/, shift;
    return { "\x00order" => [@a], map { $_, 1 } @a };
}
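
# For illustration (not called anywhere): nmspc("Talk:\n  User:   # comment")
# returns { "\x00order" => ["Talk:", "User:"], "Talk:" => 1, "User:" => 1 },
# ie. an ordered list plus a lookup set in one hashref.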

my %valid_namespaces = (
    'en.wikipedia.org' => nmspc(qq[
        Talk:
        Category talk:
        Category:
        Help:
        Help talk:
        Image:
        Image talk:
        MediaWiki:
        MediaWiki talk:
        Portal:
        Portal talk:
        Template:
        Template talk:
        User:
        User talk:
        Wikipedia:
        Wikipedia talk:
    ]),
    'de.wikipedia.org' => nmspc(qq[
        Diskussion:                     # Talk
        Kategorie:                      # Category:
        Kategorie Diskussion:           # Category Talk:
        Hilfe:                          # Help:
        Hilfe Diskussion:               # Help Talk:
        Bild:                           # Image:
        Bild Diskussion:                # Image Talk:
        MediaWiki:                      # MediaWiki:
        MediaWiki Diskussion:           # MediaWiki Talk:
        Portal:                         # Portal:
        Portal Diskussion:              # Portal Talk:
        Vorlage:                        # Template:
        Vorlage Diskussion:             # Template Talk:
        Benutzer:                       # User:
        Benutzer Diskussion:            # User Talk:
        Wikipedia:                      # Wikipedia:
        Wikipedia Diskussion:           # Wikipedia Talk:
    ]),
    # note: keep the trailing colons so these match the "Namespace:" prefixes captured from each contribution line
    'it.wikipedia.org' => nmspc(qq[
        Discussione:                    # Talk:
        Categoria:                      # Category:
        Discussioni categoria:          # Category Talk:
        Aiuto:                          # Help:
        Discussioni aiuto:              # Help Talk:
        Immagine:                       # Image:
        Discussioni immagine:           # Image Talk:
        MediaWiki:                      # MediaWiki:
        Discussioni MediaWiki:          # MediaWiki Talk:
        Template:                       # Template:
        Discussioni template:           # Template Talk:
        Utente:                         # User:
        Discussioni utente:             # User Talk:
        Wikipedia:                      # Wikipedia:
        Discussioni Wikipedia:          # Wikipedia Talk:
    ]),
);
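
# Hedged sketch for the "retrieve the list of namespaces in real-time" TODO at
# the top of this file (not called anywhere yet).  It assumes the Special:Export
# XML carries a <siteinfo><namespaces> block whose element text is the localized
# namespace name; that layout should be double-checked against a live wiki
# before relying on it.
sub fetch_namespaces_sketch {
    my $site = shift;
    my $xml = LWP::Simple::get("http://$site/wiki/Special:Export/Main_Page");
    return undef unless defined $xml;
    my @names;
    while ($xml =~ m{<namespace key="(-?\d+)"[^>]*>([^<]+)</namespace>}g) {
        push @names, "$2:" if $1 > 0;           # skip the main (0) and the negative special namespaces
    }
    return undef unless @names;
    return nmspc(join "\n", @names);            # same structure as the hand-made lists above
}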

my $query = CGI->new;
my $site = $query->param("site") || '';
my $username = CGI::Util::escape($query->param("username") || '');
    $username =~ s/[\+\s]/_/g;
my $isvalid = 0;
my $this_namespace;

if ($ENV{QUERY_STRING}) {
    $isvalid = 1;
    $isvalid = 0 unless ($site =~ /^[\w\.]*\.(org|com|net)$/i);
    #$isvalid = 0 unless ($username =~ /^[-\w\._]*$/);
    $isvalid = 0 if (length($username) == 0);
}


# data we generate by parsing the output from Wikipedia
my @urls;
my $bandwidth_down = 0;
my %namespace_totals;
my $xml_lang = "";
my $earliest_perldate;
my $latest_perldate;
my %month_totals;
my %month_editsummary_totals;
my %unique_articles;
my %namespace_unique_articles;
my %article_titles;

print "Content-type: text/html; charset=utf-8\n\n";
#cgi_dumper(\%valid_namespaces);
if (!$isvalid) {
    if ($ENV{QUERY_STRING}) {
        print "<font color=red><b>Invalid value</b></font>.  <a href='http://en.wikipedia.org/wiki/Special:Emailuser/Interiot'>email Interiot</a> if this is incorrect.<p><br><br>\n";
    }

    print <<"EOF";
This is a slow substitute for <a
href="http://en.wikipedia.org/wiki/Wikipedia:Kate%27s_Tool">Kate's Tool</a> when it's unavailable.

<form method=GET style="padding-top:1em">
<table><tr><td>username <td><input maxlength=128 name=username value="" title="username">
       <tr><td>site     <td><input maxlength=128 name=site value="en.wikipedia.org" title="site">
       <tr><td>         <td><input type=submit value="Submit">
</table>
</form>

Notes:
<ul>
<li>Green bars are edits with edit summaries; red bars are edits without summaries
<li>The statistics are real-time (it <a href="http://en.wikipedia.org/wiki/Screen_scraping">scrapes</a> data off of the <tt>Special:Contributions</tt> page while you wait).
<li>It's somewhat slow for edit counts over 5000
<li>It's unable to count deleted edits
<li>It should work with most wikis out there that use <a href="http://en.wikipedia.org/wiki/MediaWiki">MediaWiki</a>, since it doesn't need privileged access to the databases.
<!-- <li>This can't be more than a temporary solution for Wikipedia, as it wastes ~1GB/day of extra bandwidth compared to Kate's -->
<li>Source code is in the <a href="http://en.wikipedia.org/wiki/Public_domain">public domain</a> and available <a href="$ENV{SCRIPT_NAME}?code">here</a>
<li>Warning: <a href="http://www.bbc.co.uk/dna/h2g2/A1091350">metrics are evil</a>
</ul>

For bug reports/comments, see <a href="http://en.wikipedia.org/wiki/User_talk:Interiot">User talk:Interiot</a> or <a href="http://en.wikipedia.org/wiki/Special:Emailuser/Interiot">email him</a>.
EOF
} else {
    $this_namespace = $valid_namespaces{lc $site};
    #cgi_dumper(\$this_namespace); exit;

    $username =~ s/^_+|_+$//g;

    #print "$site<br>$username\n";
    $namespace_totals{earliest} = get_5000($site, $username, 0);
    #cgi_dumper(\@urls, \%namespace_totals); exit;
    #cgi_dumper(\%unique_articles);
    $namespace_totals{"number of unique articles"} = scalar(keys %unique_articles);
    $namespace_totals{"avg edits per article"} = sprintf("%5.2f", $namespace_totals{total} / $namespace_totals{"number of unique articles"});

    print $xml_lang, <<'EOF';
<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <style>
        td {padding: .1em 1em .1em}

        table.months {padding-top: 2em}
        table.months td.date {font-weight: bold}
        table.months td {font-size: 75%}

        div.red, div.green {
            height:1em;
            float:left;
        }
        div.red {background-color: #f00}
        div.green {background-color: #0f0}

        div.topN {
            float: left;
            min-height: 30em;           /* otherwise, they get ALL jumbled up */
        }
        table.topN {
            float: left;
            border: 1px solid black;
        }
        table.topN th {
            background-color: #000;
            color: #fff;
        }
        table.topN td {
            /* override the above */
            padding: .1em .3em;
        }
    </style>
</head>
<body>
EOF


    print "<ul style='padding-left:10em'><a href='$ENV{SCRIPT_NAME}'><i>Go back</i></a> to see caveats or to check another user.</ul>\n";
    print "<h3>User:$username</h3>\n";
    print "<table>\n";
    foreach my $key (sort  keys %namespace_totals) {
        print "<tr><td>", $key, "<td>", $namespace_totals{$key}, "\n";
    }
    print "</table>\n";

       
    #### output the months stats
    #cgi_dumper(\%month_editsummary_totals);
    my @months = list_months();
    my $max_width = 0;
    $max_width = ($_ > $max_width ? $_ : $max_width) foreach (values %month_totals);
    if ($max_width > 0) {
        print "<table class=months>\n";
        foreach my $month (@months) {
            my $edits      = $month_totals{$month}             || 0;   # months inside the range with no edits aren't in the hash
            my $summarized = $month_editsummary_totals{$month} || 0;
            my $no_summary = $edits - $summarized;

            print "<tr><td class=date>$month   <td>", $edits, "\n";
            #print "<td><div class=red style='width:", int(500 * $edits / $max_width), "px'></div>\n";
            print "<td><div class=green style='width:", int(500 * $summarized / $max_width), "px'></div>\n";
            print     "<div class=red   style='width:", int(500 * $no_summary / $max_width), "px'></div>\n";
        }
        print "</table>\n";
    }


    #### output the top-15 namespace stats
    my $num_to_present = 15;
    if ($this_namespace) {          # only do it if we're sure about the namespaces
        print "<p><br>\n";
        #print "<ul>NOTE: This section has a tendency to hilight a user's \"youthful indiscretions\".  Please take the dates of the edits into account.</ul>\n";
        foreach my $nmspc ("Mainspace", @{$this_namespace->{"\x00order"}}) {
            next unless $namespace_unique_articles{$nmspc} && %{$namespace_unique_articles{$nmspc}};    # skip namespaces the user never edited (and don't autovivify them)
            my @articles = sort {$namespace_unique_articles{$nmspc}{$b} <=> $namespace_unique_articles{$nmspc}{$a}}
                            grep { $namespace_unique_articles{$nmspc}{$_} > 1}          # filter out items with only 1 edit
                            keys(%{$namespace_unique_articles{$nmspc}});
            next unless @articles;

            #print "<div class=topN>\n";
            print "<table class=topN><tr><th colspan=2>$nmspc\n";
            my @present = splice(@articles, 0, $num_to_present);
            foreach my $article (@present) {
                my $artname = $article_titles{$article};
                if ($nmspc ne 'Mainspace') {
                    $artname =~ s/^.*?://;
                }
                $artname =~ s/\s/&nbsp;/g;
                my $url = "http://$site/w/index.php?title=$article&action=history";
                print "<tr><td>", $namespace_unique_articles{$nmspc}{$article}, "<td><a href='$url'>$artname</a>\n";
            }

            # fill it out so float:left doesn't jumble up
            foreach (scalar(@present) .. 14) {
                print "<tr><td>&nbsp;<td>&nbsp;\n";
            }

            print "</table>\n";
            #print "</div>\n";
        }
    }

    #### output the bottom summary
    print "<p style='clear:left'><br><br>If there were any problems, please <a href='http://en.wikipedia.org/wiki/Special:Emailuser/Interiot'>email Interiot</a> or post at <a href='http://en.wikipedia.org/wiki/User_talk:Interiot'>User talk:Interiot</a>.\n";
    #print "<p>Based on these URLs:\n<ul>\n", join("\n", map {"<li><a href='$_>$_</a>"} @urls), "</ul>\n";
    print "<div style='padding:1em 3em; font-size: 60%'>Based directly on these URLs:\n";
    foreach my $ctr (0..$#urls) {
        print "<a href='$urls[$ctr]'>[", ($ctr+1), "]</a>";
        print ", " unless ($ctr >= @urls - 1);
        print "\n";
    }
    print "</div>\n";

 
    #### log the bandwidth used
    open FOUT, ">>" . LOGFILE() or die;
    printf FOUT "%s  %-20s %-30s  %5dK  %7d\n", scalar(localtime), $username, $site,
        int($bandwidth_down / 1024), $namespace_totals{total};
    close FOUT;
}

sub get_5000 {
    my $site = shift;
    my $username = shift;
    my $offset = shift;

    my $earliest = "";

    my $url = "http://$site/w/index.php?title=Special:Contributions&target=$username&offset=${offset}&limit=5000";
    if (! $LWP::Simple::ua) {
        LWP::Simple::_init_ua();
        #$LWP::Simple::ua->agent("Mozilla/4.0 WebTV/2.6 (compatible; MSIE 4.0)");     # apparently they're picky about useragent strings
        $LWP::Simple::ua->agent("Wget/1.9.1");     # apparently they're picky about useragent strings.  Use the same as wget.
    }
    push(@urls, $url);
    if (@urls >= 10) {
        print "Too many pages fetched.  Terminating.<br>\n";
        #cgi_dumper(\@urls);
        exit;
    }
    my $page;
    if (1) {
        my $request = HTTP::Request->new(GET => $url);
        my $response = $LWP::Simple::ua->request($request);
        if (!$response->is_success) {
            print "While trying to fetch <a href='$url'>$url</a>, $site responded:<br><br>\n", $response->status_line, "<br><br>", $response->content;
            exit;
        }
        $page = $response->content;
        $bandwidth_down += length($page);
        if (0) {
            local *FOUTOUT;
            open FOUTOUT, ">/var/tmp/kate/tmp.out" or die;
            print FOUTOUT $page;
            close FOUTOUT;
        }
    } else {
        open FININ, "</var/tmp/kate/tmp.out" or die;
        local $/ = undef;
        $page = <FININ>;
        close FININ;
    }

    if ($page =~ /(<html [^>]+>)/i) {
        $xml_lang = $1;
    }

    ## parse each individual contribution
    #while ($page =~ /^<li>(\d\d:\d\d,.*)/igm) {
    while ($page =~ /^<li>([^(]+\(<a href="[^"]+action=history.*)/igm) {
        my $this_time;

        local $_ = $1;

        my $edit_summary;
        #$edit_summary++ if (m#<a href="/wiki/[^"]*"\s+title="[^"]*">[^<]*</a>\s*\(#is);
        $edit_summary++ if (/<span class='comment'>/si);

        my $article_url = "";           # default, so a line we fail to parse doesn't warn when used as a hash key below
        if (m#<a href="/wiki/([^"]+)" title="[^"]+">([^<]+)#si) {
            $article_url = $1;
            $article_titles{$1} = $2;
        }
        $unique_articles{$article_url}++;

        ## strip out all the HTML tags
        s/<[^>]*>//gs;
            
        if (/^(.*?) \(/) {
            my $date = $1;
            $earliest = $date;

            # translate month names into English, so Date::Parse can handle them
            # languages believed to work here:      EN, DE, IT
            $date =~ s/\b(?:gen                 )\b/jan/gix;
            $date =~ s/\b(?:mär                 )\b/mar/gix;
            $date =~ s/\b(?:mai|mag             )\b/may/gix;
            $date =~ s/\b(?:giu                 )\b/jun/gix;
            $date =~ s/\b(?:lug                 )\b/jul/gix;
            $date =~ s/\b(?:ago                 )\b/aug/gix;
            $date =~ s/\b(?:set                 )\b/sep/gix;
            $date =~ s/\b(?:okt|ott             )\b/oct/gix;
            $date =~ s/\b(?:dez|dic             )\b/dec/gix;

            $this_time = str2time($date);
            if (!$this_time) {                  # str2time() returns undef when it can't parse the date
                #print "XXXXXXXXXXXXXXXXXXXXXXXXX<br>\n";
            } else {
                #print scalar(gmtime($this_time)), "<br>\n";
                $earliest_perldate = $this_time;        # contributions run newest-first, so the last one seen is the earliest...
                $latest_perldate ||= $this_time;        # ...and the first one seen is the latest

                my $monthkey = monthkey(localtime($this_time));
                $month_totals{$monthkey}++;
                $edit_summary && $month_editsummary_totals{$monthkey}++;
            }
        }
        s/^[^()]*\([^()]*\) \([^()]*\) (?:\S )? //;     # strip the leading 'date (hist) (diff)' part, plus an optional one-letter flag (eg. the minor-edit "m")

        my $subspace = "Mainspace";
        if (/^([^\s\d\/:]+(?:\s[^\s\d\/:]+)?:)/) {
            if (!$this_namespace || exists $this_namespace->{$1}) {
                $subspace = $1;
            }
        }
        $namespace_totals{$subspace}++;
        $namespace_totals{total}++;
        $namespace_unique_articles{$subspace}{$article_url}++;

        #print "$_<br>\n";
    }


    ## if they have more than 5000 contributions, go to the next page
    while ($page =~ /href="[^"]+:Contributions[^"]+offset=(\d+)/ig) {
        #print "Trying again at offset $1<br>\n";
        next unless $1 > 0 && ($offset == 0 || $1 < $offset);
        return get_5000($site, $username, $1);      # tail recursion until there are no more
    }

    return $earliest;
}
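
# Hedged sketch for the "purely an autocomment" TODO at the top of this file
# (not called anywhere yet).  It assumes MediaWiki wraps the /* section */ part
# of a summary in <span class="autocomment"> inside the usual comment span; both
# class names would need checking against the live HTML.  Given one raw <li>
# from Special:Contributions, it answers "did the user type a real summary?"
sub has_real_summary_sketch {
    my $li_html = shift;
    return 0 unless $li_html =~ m{<span class=['"]comment['"]>}i;       # no summary at all
    my $copy = $li_html;
    # drop autocomment spans (assumed to contain only text and links, no nested <span>s)
    $copy =~ s{<span class=['"]autocomment['"]>(?:(?!</?span).)*</span>}{}sig;
    return 0 unless $copy =~ m{<span class=['"]comment['"]>(.*?)</span>}si;
    my $text = $1;
    $text =~ s/<[^>]*>//gs;                                             # strip any remaining markup
    $text =~ s/[\s()]+//g;                                              # and the surrounding parens/whitespace
    return length($text) ? 1 : 0;
}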

# returns something like [
#       "2003/10",
#       "2003/11",
#       "2003/12"
# ]
sub list_months {
    my $last_monthkey = '';
    my @ret;
    # yes, this is a fairly odd algorithm (step through the range ten days at a time and record each new month we land in).  oh well.
    for (my $date=$earliest_perldate; $date<=$latest_perldate; $date+=10*24*60*60) {
        my $monthkey = monthkey(localtime($date));
        if ($monthkey ne $last_monthkey) {
            push(@ret, $monthkey);
            $last_monthkey = $monthkey;
        }
    }
    return @ret;
}
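
# A more direct way to enumerate the same months, left here as an unused sketch:
# walk (year, month) pairs from the earliest edit to the latest instead of
# stepping the clock forward ten days at a time.
sub list_months_sketch {
    return () unless $earliest_perldate && $latest_perldate;
    my ($m0, $y0) = (localtime($earliest_perldate))[4, 5];
    my ($m1, $y1) = (localtime($latest_perldate))[4, 5];
    my @ret;
    while ($y0 < $y1 || ($y0 == $y1 && $m0 <= $m1)) {
        push(@ret, ($y0 + 1900) . "/" . ($m0 + 1));
        if (++$m0 > 11) { $m0 = 0; $y0++; }
    }
    return @ret;
}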

sub monthkey {($_[5] + 1900) . "/" . ($_[4] + 1)}       # takes a localtime() list; indices 4 and 5 are month and year
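
# Hedged sketch for the "# edits per day" TODO at the top of this file (not
# called anywhere yet): once get_5000() has filled in $earliest_perldate,
# $latest_perldate and $namespace_totals{total}, the overall rate falls out of
# the globals that already exist.
sub edits_per_day_sketch {
    return undef unless $earliest_perldate && $latest_perldate && $namespace_totals{total};
    my $days = ($latest_perldate - $earliest_perldate) / (24 * 60 * 60);
    $days = 1 if $days < 1;                     # don't divide by a fraction of a day
    return sprintf("%.2f", $namespace_totals{total} / $days);
}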

sub cgi_dumper {print "<pre>", HTML::Entities::encode(Dumper(@_)), "</pre>"}
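
# Hedged sketch for the starting-cutoff-date TODO at the top of this file.  The
# "cutoff" CGI parameter is hypothetical and is not in the form yet; the idea is
# only that Date::Parse can turn a user-supplied date into a perl time which the
# contribution loop in get_5000() could compare $this_time against, skipping
# anything older.
sub parse_cutoff_sketch {
    my $raw = shift;                            # eg. "2005-06-01"
    return 0 unless defined $raw && length $raw;
    return str2time($raw) || 0;                 # 0 means "no cutoff"
}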