Wikipedia:Persondata/transform.pl

From Wikipedia, the free encyclopedia
Jump to: navigation, search
#!/usr/bin/perl -w
use POSIX;
##########################
# Transform Persondata
##########################
 
# Parse date entry and expand each date field to 8 fields
# Likewise the articles about places and first/last names
 
##########################
 
my %months=(
  January => 1,
  February => 2,
  March => 3,
  April => 4,
  May => 5,
  June => 6,
  July => 7,
  August => 8,
  September => 9,
  October => 10,
  November => 11,
  December => 12
);
 
my %monthname = reverse %months;
 
 
 
my $MONTH  = "(" . join("|",keys %months) . ")";
my $YEAR   = "(\\d{1,4})";
my $PREFIX = "([^0-9]+)";
 
 
# Clean up date entry for standardised persondata date format
sub clean_date {
  my $d = shift;
 
  # Put spaces between wikilinks
  $d =~ s/([\]])([\[])/$1 $2/g;
 
  # Deal with piped links
  $d =~ s/[\[]{2}([^\]]*)[|](.*)[\]]{2}/$2/g;
 
  # Remove square brackets
  $d =~ s/[\[\]]//g;
 
  # Deal with templates
  $d =~ s/{{([Bb]irth|[Dd]eath) date( and age)?\s?\|\s?(\d{1,4})\|\s?(\d{1,2})\|\s?(\d{1,2})(.*)/$3-$4-$5/g;
 
  # DD.MM.YYYY => DD MMM YYYY
  if ($d =~ /^0?(\d{1,2})\.(\d{1,2})\.(\d{1,4})$/) {
    my $month = int $2;
    $d = "$1 ".$monthname{$month}." $3";
  }
 
  # YYYY-MM-DD => DD MMM YYYY
  if ($d =~ /^(\d{1,4})-(\d{1,2})-0?(\d{1,2})$/) {
    my $month = int $2;
    $d = "$3 ".$monthname{$month}." $1";
  }
 
  # AD/CE is implicit
  $d =~ s/A\. ?D\.//; # TODO: doesn't completely work?? e.g. [[Libanios]]
  $d =~ s/A ?D//; 
  $d =~ s/\bC\. ?E\.//;
  $d =~ s/\bC ?E//; 
 
  # Remove trailing punctuation
  $d =~ s/(,|\||;|=)$//;
 
  # Remove HTML comments
  $d =~ s/<!--(.*)-->//;
 
  # Add forgotten spaces
  $d =~ s/([a-z])(\d)/$1 $2/g;
  $d =~ s/(\d)([a-z])/$1 $2/g;
  $d =~ s/\.([^ ])/. $1/g;
 
  #Remove spaces from ordinals
  $d =~ s/(\d) (th|st|nd|rd|s\b)/$1$2/g;
 
  # remove double spaces
  $d =~ s/\s+/ /g;
 
  # remove spaces at beginning and end
  $d =~ s/^\s+//;
  $d =~ s/\s+$//;
 
  # Fields with only question marks => "unknown"
  $d =~ s/^\?$/unknown/;
 
  # Uniform capitalisation
  $d =~ s/Unknown/unknown/g;
  $d =~ s/After/after/g;
 
  # question marks should always have brackets and a space before them
  $d =~ s/\(\?\)/\?/g;
  $d =~ s/([^ ])\?+/$1 \?/g;
  $d =~ s/\?/(\?)/g;
 
  # Instead of a question mark at the end, 'probably' at the beginning
  $d =~ s/^(.+) \(\?\)$/probably $1/;
  $d =~ s/^\(\?\)$/probably/;
 
  # Slash without space
  $d =~ s/ ?\/ ?/\//g;
 
  # Remove bolding/italics
  $d =~ s/\'{2,5}//g;
 
  # Shortened month names
  $d =~ s/Jan[\.\s]+/January /; 
  $d =~ s/Feb[\.\s]+/February /; 
  $d =~ s/Mar[\.\s]+/March /;
  $d =~ s/Apr[\.\s]+/April /;  
  $d =~ s/Jun[\.\s]+/June /;  
  $d =~ s/Jul[\.\s]+/July /;  
  $d =~ s/Aug[\.\s]+/August /;
  $d =~ s/Sep[\.\s]+/September /;
  $d =~ s/Sept[\.\s]+/September /;
  $d =~ s/Oct[\.\s]+/October /;
  $d =~ s/Nov[\.\s]+/November /;
  $d =~ s/Dec[\.\s]+/December /;
 
  # Write out "Century"
  $d =~ s/C\./century/g;
 
  # "End of the 5th century" => "End 5th century" (simpler)
  $d =~ s/ of the//;
 
  $d =~ s/(approx\.?|[Cc]irca\.?\s?|\bca?[\.\s]+|about|around|~)/circa /; 
 
  $d =~ s/([Pp]ossibly|[Pp]robably)/probably/g;
 
  $d =~ s/([Ss]till )?[Ll]iving( [Pp]erson)?//;
  $d =~ s/[Nn]\/?[Aa]//;
  $d =~ s/[Nn]ot [Aa]pplicable//;
 
  # Remove double spaces
  $d =~ s/\s+/ /g;
 
 
  #----------Error fixing---------
 
  # Other typing errors
  #$d =~ s/^Um/um/;
  #$d =~ s/chr/Chr/;
 
  return $d;
}
 
sub ilog10 { $x=shift; return int log10($x); }  
 
my @fields = ( "day", "month", "year", "century", "decade", "year1", "year2", "note" );
 
sub parse_date {    
  my $date = shift;
  my %d;
 
  if (! $date ) {
    $d{"note"} = "";
 
  # Normal date entry
  } elsif ($date =~ /^$PREFIX?(\d+ )?$MONTH,? $YEAR( BCE?\.?)?$/) {        
    $d{note} = $1 if defined $1;
    $d{day}    = substr($2,0,-1) if defined $2;
    if (defined $3) {
      $d{month} = $months{$3};
    }  
    $d{year}   = int $4;
    $d{year} = -$d{year} if defined $5;  # B.C.
 
  } elsif ($date =~ /^$PREFIX?$MONTH (\d+)?,?\s?$YEAR( BCE?\.?)?$/) {        
    $d{note} = $1 if defined $1;
    $d{day}    = substr($3,0) if defined $3;
    if (defined $2) {
      $d{month} = $months{$2};
    }  
    $d{year}   = int $4;
    $d{year} = -$d{year} if defined $5;  # B.C.
 
  } elsif ($date =~ /^$PREFIX?$YEAR( BCE?\.?)?$/) {
    $d{note} = trim($1) if defined $1;
    $d{year}   = int $2;    
    $d{year} = -$d{year} if defined $3;  # B.C.    
 
  # Century
  #} elsif ($date =~ /^(around |probably |[Bb]eginning |[Mm]iddle |[Ee]nd )?(\d{1,2})\. [Cc]entury( BCE?\.?)?$/) {
  } elsif ($date =~ /^$PREFIX?(\d{1,2})(\.|th|st|nd|rd) [Cc]entury( BCE?\.?)?$/) {
    $d{note} = trim($1) if defined $1;
    $d{century} = $2;
    $d{century} = -$d{century} if defined $4;
 
  # Decade
  } elsif ($date =~ /^(\d{1,4}) ?'?s( BCE?\.?)?$/) {
    $d{decade} = (int $1 / 10) * 10;
    $d{decade} = -$d{decade} if defined $3;    
    $d{century} = int ($d{decade} / 10) + 1;
 
  } elsif ($date =~ /^$PREFIX?$YEAR or $YEAR$/) { # TODO: or != until, TODO: B.C.
    $d{note} = trim($1) if defined $1;
    $d{year1}  = int $2;
    $d{year2}  = int $3;
 
  } elsif ($date =~ /^$PREFIX?$YEAR\/(\d{1,4})$/) { # TODO: B.C.
    $d{note} = trim($1) if defined $1;
    $d{year1}  = int $2;
    $d{year2}  = int $3;
 
    # Example: 1632/33 => 1632/1633
    $c = ilog10($d{year1}) - ilog10($d{year2});
    if ( $c < 0 ) { # Errr
      $d{year1} = "";
      $d{year2} = "";
      $d{note} = $date;     
    } else {
      $d{year2} = substr($d{year1},0,$c) . $d{year2};
    }   
 
  # Interval of several years  # TODO: not tested!
  } elsif ( $date =~ /^$PREFIX?between $YEAR( and )$YEAR( BCE?\.?)?$/ or
            $date =~ /^$PREFIX?$YEAR( to |-)$YEAR$( BCE?\.?)?/ )
  {
    $d{note} = trim($1) if defined $1;
 
    $d{year1}  = int $2;
    $d{year2}  = int $4;
 
    if (defined $5) {
      ($d{year1}, $d{year2}) = (-$d{year2}, -$d{year1});
    }   
 
  #} elsif ($date =~ /^$PREFIX?$YEAR( to |-)$YEAR$/) {
  #  $d{note} = trim($1) if defined $1;
  #  $d{year1}  = int $2;
  #  $d{year2}  = int $4;
 
  } else {    
    $d{note} = $date; 
  }
 
  if (defined $d{year1} and defined $d{year2}) {
    # if years need to be switched
    #if ($d{year1} > $d{year2}) {
    #  ($d{year1}, $d{year2}) = ($d{year2}, $d{year1});
    #}  
 
    # if century can be defined
    if ( (int $d{year1} / 100) eq (int $d{year2} / 100)) {
      $d{century} = (int $d{year1} / 100) + 1;
    }      
 
    # if decade can be defined    
    if ( $d{century} and (int $d{year1} / 10) eq (int $d{year2} / 10)) {
      $d{decade} = (int $d{year1} / 10) * 10;
    }
 
  }
 
  # calculate decade and century
  if (defined $d{year}) {      
    $d{decade} = (int $d{year} / 10) * 10;
  }
 
  # calculate century
  if (defined $d{decade}) {
    if( $d{decade} != 0 ){
      $d{century} = (int $d{decade} / 100) + (int $d{decade} / abs $d{decade});
    }
    else{
      if (defined $d{year}) {      
        $d{century} = (int $d{decade} / 100) + (int $d{year} / abs $d{year});
      }
      else{
      }
    }
  }  
 
  # Initialise undefined fields
  foreach my $f (@fields) {
     $d{$f} = '' if not defined $d{$f};
  }
 
  return %d;
}
 
sub parse_location {
  my $p = trim(shift);
 
  # [[a]] or [[b]] <- so what is it?! 
 
  # [[...]]
  # [[...]], ...
  # [[...]] , ...
  # [[...]]/ ... <- preferably not like this
  if ($p =~ /^\[\[([^\]]+)\]\].*?$/) {
    my $a = $1;    
    $a =~ s/\|.*$//;        
    #print "$p|$a\n";
    return $a;
  } else {
    #print "!$p\n";
    return "";     
  }  
}
 
#For PND number (assigned to German-speaking authors).
#Small number of articles on en wiki have this
sub parse_pnd {
  my $p = trim(shift);
 
  my $pnd_nr = "";
  my $pnd_date = "";
 
  if ($p =~ /^(\d)(\d)(\d)(\d)(\d)(\d)(\d)(\d)([\dX])/) {    
    my $check = $9; 
    $check = 10 if $check eq "X";
    # pruefziffer    
    if ( ((2*$1+3*$2+4*$3+5*$4+6*$5+7*$6+8*$7+9*$8) % 11) eq $check ) {
      # TODO: Number range 10000000 bis 14999999
      if ($1 == 1 && $2>=0 && $2<=4) {
        $pnd_nr = "$1$2$3$4$5$6$7$8$9";
      }  
    }  
 
  } 
 
  if ($p =~ /(\d){1,2}\.(\d{1,2})\.(\d\d\d\d)/) {
    $pnd_date = "$3-$2-$1"; 
  } 
 
  return ($pnd_nr, $pnd_date);
}
 
 
sub unbracket {
  my $p = shift;
 
  # Insert missing spaces between wikilinks
  $p =~ s/([\]])([\[])/$1 $2/g;
 
  # remove square brackets
  $p =~ s/[\[\]]//g;
 
  return $p;
}  
 
sub trim {
  my $p = shift;
 
  if ($p) {
    # remove empty spaces at beginning and end
    $p =~ s/^\s+//;
    $p =~ s/\s+$//;
  }  
 
  return $p;
}
 
##########################
 
while(my $line = <>) {
  my @pd = split("\t",$line);
  chop($pd[-1]); # remove end-of-line
 
  my @pd_transformed;
  $pd_transformed[0] = trim($pd[0]); # pd_id
  $pd_transformed[1] = trim($pd[1]); # pd_article
  $pd_transformed[2] = trim($pd[2]); # pd_name
  $pd_transformed[3] = trim($pd[3]); # pd_alternative
  $pd_transformed[4] = trim($pd[4]); # pd_description
  $pd_transformed[5] = unbracket(trim($pd[5])); # pd_born  
  $pd_transformed[6] = trim($pd[6]); # pd_born_in
  $pd_transformed[7] = unbracket(trim($pd[7])); # pd_died
  $pd_transformed[8] = trim($pd[8]); # pd_died_in
  $pd_transformed[9] = trim($pd[9]); # pd_pnd
 
  # extract checked pnd-nr and additional date
  ($pd_transformed[10], $pd_transformed[11]) = parse_pnd($pd[9]); # pnr_nr, pnd_date
 
  if ( trim($pd[2]) =~ /^([^,]+),([^,]+)$/ ) {
    $pd_transformed[12] = trim($2); # n_given
    $pd_transformed[13] = trim($1); # n_surname
    $pd_transformed[14] = ''; # n_suffix
  }
  elsif ( trim($pd[2]) =~ /^([^,]+),([^,]+),([^,]+)$/ ) {
    $pd_transformed[12] = trim($2); # n_given
    $pd_transformed[13] = trim($1); # n_surname
    $pd_transformed[14] = trim($3); # n_suffix
  }
  else {
    $pd_transformed[12] = ''; # n_given
    $pd_transformed[13] = ''; # n_surname
    $pd_transformed[14] = ''; # n_suffix
  }  
 
  $pd_transformed[15] = parse_location($pd[6]); # b_place
  $pd_transformed[16] = parse_location($pd[8]); # d_place
 
  my %born = parse_date(clean_date($pd[5]));
 
  $pd_transformed[17] = $born{day};     # b_day
  $pd_transformed[18] = $born{month};   # b_month
  $pd_transformed[19] = $born{year};    # b_year
  $pd_transformed[20] = $born{decade};  # b_decade
  $pd_transformed[21] = $born{century}; # b_century
  $pd_transformed[22] = $born{year1};   # b_year1
  $pd_transformed[23] = $born{year2};   # b_year2
  $pd_transformed[24] = $born{note};    # b_note      
 
  my %died = parse_date(clean_date($pd[7]));
 
  $pd_transformed[25] = $died{day};     # d_day
  $pd_transformed[26] = $died{month};   # d_month
  $pd_transformed[27] = $died{year};    # d_year
  $pd_transformed[28] = $died{decade};  # d_decade  
  $pd_transformed[29] = $died{century}; # d_century
  $pd_transformed[30] = $died{year1};   # d_year1
  $pd_transformed[31] = $died{year2};   # d_year2
  $pd_transformed[32] = $died{note};    # d_note
 
  print join("\t",@pd_transformed) . "\n";
 
}