User:HighInBC/notes
Appearance
Homoglyph testing
[edit]- Please note that I know this code is in a terrible state; I am just prototyping it, and it is nowhere near presentable yet. Chillum 18:23, 7 September 2009 (UTC)
#!/usr/bin/perl
# Prototype homoglyph matcher.
#
# Downloads a table of homoglyphs (look-alike spellings such as "0" for "o"),
# expands a target phrase into a regular expression that accepts any
# combination of those look-alikes, and reports whether a test string
# contains a disguised form of the target phrase.
use strict;
use warnings;
use utf8;
use LWP::Simple;
use Data::Dumper;

# The homoglyph table contains non-ASCII characters, so the trace must be
# written through an encoding layer rather than as raw wide characters.
binmode(STDOUT, ':encoding(UTF-8)');

my $HOMOGLYPH_URL = 'http://en.wikipedia.org/wiki/User:HBC_NameWatcherBot/Homoglyphs?action=raw';

my $test_string = lc('S0opa d3wd (0)ri \/\/|-|3@|Z');
my $target_word = lc('on wheels');

# get() returns undef on any failure; stop here instead of silently building
# a pattern from an empty table.
my $data = get($HOMOGLYPH_URL);
die "Unable to fetch homoglyph table from $HOMOGLYPH_URL\n" unless defined $data;

# Changes only the internal representation (no decoding is performed), so
# that characters above 0x7F get Unicode semantics in the /i matches below.
utf8::upgrade($data);

my @homoglyphs = parse_homoglyphs($data);
#print Dumper(\@homoglyphs);

print "Comparing '$test_string' against '$target_word' taking into account homogylphs\n\n";

my $target_pattern = build_pattern($target_word, \@homoglyphs, 1);

if ($test_string =~ m/($target_pattern)/i) {
    print "\n\tMatch: '$1' matches '$target_word'.\n";
}

# Parse the raw table text into a list of [ $symbol, \@alternatives ] pairs,
# in the order they appear. Only lines of the form ";symbol: alt alt ..." are
# used; entries that end up with no alternatives are dropped.
sub parse_homoglyphs {
    my ($text) = @_;

    my @table;
    LINE: for my $line (split /\n/, $text) {
        next LINE unless $line =~ m{^;(.+?):\s*(.+)$};
        my ($symbol, $list) = ($1, $2);

        # An empty alternative would make the final pattern match anywhere,
        # so empty fields are discarded.
        my @alternatives = grep { length } split /\s+/, $list;
        next LINE unless @alternatives;

        push @table, [ $symbol, \@alternatives ];
    }
    return @table;
}

# Expand $word into regex source that also accepts every homoglyph spelling.
#
#   $word  - the phrase to search for, as literal text
#   $table - array ref of [ $symbol, \@alternatives ] pairs, applied in order
#   $trace - when true, print each substitution step to STDOUT
#
# The pattern is kept as a token list while it is being built: plain strings
# are literal text (escaped only when rendered), scalar refs are regex syntax.
# Substitutions are applied to literal text only, so a symbol can never
# rewrite the "(", "|" or ")" of an alternation inserted earlier, and symbols
# containing regex metacharacters are handled safely. Later table entries
# still apply inside alternatives inserted by earlier ones, so chained
# look-alikes (e.g. "ee" -> "ea" -> "3@") keep working.
sub build_pattern {
    my ($word, $table, $trace) = @_;

    my @tokens = ($word);
    for my $entry (@{$table}) {
        my ($symbol, $alternatives) = @{$entry};
        my @group = _group_tokens($symbol, @{$alternatives});

        my @expanded;
        for my $token (@tokens) {
            if (ref $token) {
                push @expanded, $token;
                next;
            }

            # The capturing separator makes split return the pieces as
            # literal, match, literal, match, ..., literal.
            my @pieces = split /(\Q$symbol\E)/i, $token, -1;
            while (@pieces) {
                my $literal = shift @pieces;
                push @expanded, $literal if length $literal;
                next unless @pieces;
                shift @pieces;              # the matched occurrence itself
                push @expanded, @group;
            }
        }
        @tokens = @expanded;

        if ($trace) {
            print "$symbol -> ", _render(@group), "\n";
            print "\t", _render(@tokens), "\n";
        }
    }
    return _render(@tokens);
}

# Token sequence for the alternation "(symbol|alt1|alt2|...)". Capturing
# parentheses are kept so the printed trace keeps its established shape; the
# caller only ever reads $1, which is the outermost group.
sub _group_tokens {
    my ($symbol, @alternatives) = @_;

    my @group = (\'(', $symbol);
    push @group, \'|', $_ for @alternatives;
    push @group, \')';
    return @group;
}

# Turn a token list into regex source: literal text is escaped, syntax tokens
# are emitted verbatim.
sub _render {
    my (@tokens) = @_;
    return join '', map { ref $_ ? ${$_} : quotemeta $_ } @tokens;
}
Result:
Comparing 's0opa d3wd 0ri \/\/|-|3&|z' against 'on wheels' taking into account homogylphs ee -> (\Qee\E|\Qea\E) on wh(ee|ea)ls aa -> (\Qaa\E|\Qee\E) on wh(ee|ea)ls a -> (\Qa\E|\Q4\E|\Qci\E|\Qaye\E|\Qci\E|\Q/\\\E|\Q/-\\\E|\Q\@\E|\Q∂\E|\Qλ\E|\Q\&\E) on wh(ee|e(a|4|ci|aye|ci|\/\\|\/\-\\|\@|∂|λ|\&))ls b -> (\Qb\E|\Q13\E|\Q!3\E|\Q8\E|\Qß\E|\Q13\E|\QI3\E|\Q|3\E|\QP>\E|\Q|:\E|\Q!3\E|\Q(3\E|\Q/3\E|\Q\\3\E|\Q)3\E|\Q[3\E|\Q]3\E) on wh(ee|e(a|4|ci|aye|ci|\/\\|\/\-\\|\@|∂|λ|\&))ls c -> (\Qc\E|\Qk\E) on wh(ee|e(a|4|(c|k)i|aye|(c|k)i|\/\\|\/\-\\|\@|∂|λ|\&))ls d -> (\Qd\E|\Qcl\E) on wh(ee|e(a|4|(c|k)i|aye|(c|k)i|\/\\|\/\-\\|\@|∂|λ|\&))ls e -> (\Qe\E|\Q3\E) on wh((e|3)(e|3)|(e|3)(a|4|(c|k)i|ay(e|3)|(c|k)i|\/\\|\/\-\\|\@|∂|λ|\&))ls f -> (\Qf\E|\Qph\E) on wh((e|3)(e|3)|(e|3)(a|4|(c|k)i|ay(e|3)|(c|k)i|\/\\|\/\-\\|\@|∂|λ|\&))ls g -> (\Qg\E|\Qcj\E) on wh((e|3)(e|3)|(e|3)(a|4|(c|k)i|ay(e|3)|(c|k)i|\/\\|\/\-\\|\@|∂|λ|\&))ls h -> (\Qh\E|\Q|-|\E) on w(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|k)i|ay(e|3)|(c|k)i|\/\\|\/\-\\|\@|∂|λ|\&))ls i -> (\Qi\E|\Ql\E|\Q1\E|\Q!\E|\Q|\E) on w(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|k)(i|l|1|\!|\|)|ay(e|3)|(c|k)(i|l|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))ls k -> (\Qk\E|\Qc\E) on w(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|l|1|\!|\|)|ay(e|3)|(c|(k|c))(i|l|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))ls l -> (\Ql\E|\Qi\E|\Q1\E|\Q!\E|\Q|\E) on w(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|1|\!|\|)s m -> (\Qm\E|\Qrn\E) on w(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|1|\!|\|)s n -> (\Qn\E|\Qri\E) o(n|ri) w(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|1|\!|\|)s o -> (\Qo\E|\Q0\E|\Q()\E|\Q(0)\E) (o|0|\(\)|\(0\))(n|ri) 
w(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|1|\!|\|)s q -> (\Qq\E|\Q0\E) (o|0|\(\)|\(0\))(n|ri) w(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|1|\!|\|)s s -> (\Qs\E|\Q\$\E|\Q5\E|\Qz\E) (o|0|\(\)|\(0\))(n|ri) w(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|1|\!|\|)(s|\$|5|z) u -> (\Qu\E|\Q|_|\E) (o|0|\(\)|\(0\))(n|ri) w(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|1|\!|\|)(s|\$|5|z) v -> (\Qv\E|\Q\\/\E) (o|0|\(\)|\(0\))(n|ri) w(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|1|\!|\|)(s|\$|5|z) w -> (\Qw\E|\Qvv\E|\Q\\/\\/\E) (o|0|\(\)|\(0\))(n|ri) (w|vv|\\\/\\\/)(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|1|\!|\|)(s|\$|5|z) x -> (\Qx\E|\Q\%\E) (o|0|\(\)|\(0\))(n|ri) (w|vv|\\\/\\\/)(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|1|\!|\|)(s|\$|5|z) z -> (\Qz\E|\Q2\E) (o|0|\(\)|\(0\))(n|ri) (w|vv|\\\/\\\/)(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|1|\!|\|)(s|\$|5|(z|2)) 1 -> (\Q1\E|\Qi\E|\Ql\E|\Q!\E) (o|0|\(\)|\(0\))(n|ri) (w|vv|\\\/\\\/)(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|(1|i|l|\!)|\!|\|)|(1|i|l|\!)|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|(1|i|l|\!)|\!|\|)|(1|i|l|\!)|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|(1|i|l|\!)|\!|\|)(s|\$|5|(z|2)) 2 -> (\Q2\E|\Qz\E) (o|0|\(\)|\(0\))(n|ri) 
(w|vv|\\\/\\\/)(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|(1|i|l|\!)|\!|\|)|(1|i|l|\!)|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|(1|i|l|\!)|\!|\|)|(1|i|l|\!)|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|(1|i|l|\!)|\!|\|)(s|\$|5|(z|(2|z))) 3 -> (\Q3\E|\QE\E) (o|0|\(\)|\(0\))(n|ri) (w|vv|\\\/\\\/)(h|\|\-\|)((e|(3|E))(e|(3|E))|(e|(3|E))(a|4|(c|(k|c))(i|(l|i|(1|i|l|\!)|\!|\|)|(1|i|l|\!)|\!|\|)|ay(e|(3|E))|(c|(k|c))(i|(l|i|(1|i|l|\!)|\!|\|)|(1|i|l|\!)|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|(1|i|l|\!)|\!|\|)(s|\$|5|(z|(2|z))) 5 -> (\Q5\E|\Qs\E) (o|0|\(\)|\(0\))(n|ri) (w|vv|\\\/\\\/)(h|\|\-\|)((e|(3|E))(e|(3|E))|(e|(3|E))(a|4|(c|(k|c))(i|(l|i|(1|i|l|\!)|\!|\|)|(1|i|l|\!)|\!|\|)|ay(e|(3|E))|(c|(k|c))(i|(l|i|(1|i|l|\!)|\!|\|)|(1|i|l|\!)|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|(1|i|l|\!)|\!|\|)(s|\$|(5|s)|(z|(2|z))) 0 -> (\Q0\E|\Qo\E|\Q()\E|\Q(0)\E) (o|(0|o|\(\)|\(0\))|\(\)|\((0|o|\(\)|\(0\))\))(n|ri) (w|vv|\\\/\\\/)(h|\|\-\|)((e|(3|E))(e|(3|E))|(e|(3|E))(a|4|(c|(k|c))(i|(l|i|(1|i|l|\!)|\!|\|)|(1|i|l|\!)|\!|\|)|ay(e|(3|E))|(c|(k|c))(i|(l|i|(1|i|l|\!)|\!|\|)|(1|i|l|\!)|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|(1|i|l|\!)|\!|\|)(s|\$|(5|s)|(z|(2|z))) Match: '0ri \/\/|-|3&|z' matches 'on wheels'.