User:Wizzy/badwords
Appearance
#! /usr/bin/perl
#
# badwords - scan a tree of saved Wikipedia HTML pages for "bad words".
#
# Usage: badwords <wikipedia directory> <badwords file>
#
# The badwords file holds one entry per line.  An entry written as /.../
# is used as a regular expression; any other entry is matched literally.
# Both kinds are wrapped in \b ... \b before matching.
#
# In every file under the directory whose name ends in "html", each line
# from the one containing </head> onwards is tested against every entry.
# Each hit is printed as
#     <up to 15 chars before:match:up to 15 chars after>TAB<page title>
# An entry that also matches the page title is skipped for that page.
# After the scan, a separator line and the per-entry hit counts are
# printed in ascending order of count.

use strict;
use warnings;
use File::Find ();
use English;    # supplies $PREMATCH, $MATCH and $POSTMATCH

if (@ARGV != 2) {
    print "Usage: $0 <wikipedia directory> <badwords file>\n";
    exit 1;
}

my ($wikidir, $badwordsfile) = @ARGV;

# Load the word list from the second argument (the badwords file), not
# from the directory given as the first argument.
open(my $badwords_fh, '<', $badwordsfile)
    or die("can't open badwords file $badwordsfile: $!");
my @badwords = <$badwords_fh>;
close($badwords_fh);
chomp(@badwords);
s/\r\z// for @badwords;                   # tolerate CRLF line endings
@badwords = grep { length } @badwords;    # an empty entry would match at every word boundary

# Compile every entry once, up front, instead of once per scanned line.
# Each element is [ entry text as written in the file, compiled regex ].
my @matchers;
for my $badword (@badwords) {
    my $regex;
    if ($badword =~ m:^/(.*)/$:) {
        # /.../ entry: the text between the slashes is a regular expression.
        my $pattern = $1;
        $regex = eval { qr/\b$pattern\b/ }
            or die("bad pattern $badword in badwords file $badwordsfile: $@");
    }
    else {
        # Literal entry.  The \E matters: without it \Q would also quote
        # the trailing \b, and the entry could never match ordinary text.
        $regex = qr/\b\Q$badword\E\b/;
    }
    push @matchers, [$badword, $regex];
}

# Number of hits per badwords entry, keyed by the entry text.
my %count;

# File::Find callback.  $_ holds the name of the current file; files whose
# name does not end in "html" are ignored.  Prints one line per hit and
# updates %count.  Dies if a matching file cannot be opened.
sub wanted {
    return unless /html\z/;

    my $file = $_;
    # Three-argument open, so a file name can never be taken for a mode.
    open(my $fh, '<', $file) or die("Can't open file $file: $!");

    my $title   = '';    # stays empty if the page has no recognisable <title>
    my $in_body = 0;     # becomes true at the line containing </head>

    while (my $line = <$fh>) {
        if ($line =~ m%<title>(.*)- Wikipedia, the free encyclopedia</title>%) {
            $title = $1;
        }
        $in_body = 1 if $line =~ m%</head>%;
        next unless $in_body;

        for my $matcher (@matchers) {
            my ($badword, $regex) = @$matcher;

            next if $title =~ $regex;      # skip this if it matches the title
            next unless $line =~ $regex;

            # $PREMATCH/$MATCH/$POSTMATCH describe the successful match
            # against $line made just above.
            my $prematch  = substr($PREMATCH, -15);
            my $postmatch = substr($POSTMATCH, 0, 15);
            print "<$prematch:$MATCH:$postmatch>\t$title\n";
            $count{$badword}++;
        }
    }

    close($fh);
    return;
}

# Traverse desired filesystems
File::Find::find({wanted => \&wanted}, $wikidir);

print "===================================\n";
# Ascending by hit count; entries with equal counts are ordered by name so
# that the summary is stable from run to run.
for my $badword (sort { $count{$a} <=> $count{$b} or $a cmp $b } keys %count) {
    print "$count{$badword}\t$badword\n";
}

exit;