Jump to content

User:HighInBC/notes

From Wikipedia, the free encyclopedia

Homoglyph testing

[edit]
  • Please note that I know this code is in a terrible state; I am just prototyping it, and it is nowhere near presentable yet. Chillum 18:23, 7 September 2009 (UTC)
#!/usr/bin/perl
use strict;
use utf8;
use LWP::Simple;
use Data::Dumper;

# Prototype: check whether an obfuscated string contains a target word once
# look-alike ("homoglyph") spellings are taken into account.
#
# Steps:
#   1. Download the homoglyph table (raw wikitext) and parse it.
#   2. Expand the target word into a regex in which every listed symbol is
#      replaced by an alternation of the symbol and its look-alikes.
#   3. Match the test string against that regex, case-insensitively.

# Sample input and the plain word to look for; both are lower-cased up front.
my $test_string = lc('S0opa d3wd (0)ri \/\/|-|3@|Z');
my $target_word = lc('on wheels');

my $homoglyph_url = 'http://en.wikipedia.org/wiki/User:HBC_NameWatcherBot/Homoglyphs?action=raw';

# LWP::Simple::get returns undef on failure; stop here rather than silently
# continuing with an empty table and reporting a misleading result.
my $data = get($homoglyph_url);
die "Could not fetch homoglyph list from $homoglyph_url\n"
    unless defined $data;
utf8::upgrade($data);

# Definition lines look like ";symbol: alt1 alt2 ...".  Entries are kept in
# page order as [ symbol, [ regex-escaped alternatives ] ] pairs, because the
# order in which substitutions are applied affects the final pattern.
my @homoglyphs;
LINE: foreach my $line (split /\n/, $data) {
    next LINE unless $line =~ m|^;(.+?):\s*(.+)$|;
    my ($symbol, $alternatives) = ($1, $2);

    # Escape every alternative so it is matched literally, and drop empty
    # fields: an empty alternative would let the group match nothing at all.
    my @matches = map { quotemeta } grep { length } split /\s+/, $alternatives;
    push @homoglyphs, [ $symbol, \@matches ];
}

#print Dumper(\@homoglyphs);
#__END__

print "Comparing '$test_string' against '$target_word' taking into account homogylphs\n\n";

# Expand the target word one table entry at a time.
#
# NOTE: each substitution runs over the pattern built so far, so alternatives
# inserted by an earlier entry are themselves expanded by later entries
# (e.g. the "ci" alternative of "a" later becomes "(c|k)(i|...)").
my $target_pattern = $target_word;
foreach my $entry (@homoglyphs) {
    my ($symbol, $ra_values) = @{$entry};
    next unless @{$ra_values};

    # The symbol comes from an editable wiki page: treat it as literal text,
    # never as regex syntax, both when searching and inside the group.
    my $literal = quotemeta($symbol);
    my $pattern = '(' . join('|', $literal, @{$ra_values}) . ')';

    print "$symbol -> $pattern\n";
    $target_pattern =~ s{$literal}{$pattern}gi;
    print "\t$target_pattern\n";
}

# The outermost parentheses open first, so $1 is the whole matched text even
# though the expanded pattern contains many nested capture groups.
if ($test_string =~ m"($target_pattern)"i) {
    print "\n\tMatch: '$1' matches '$target_word'.\n";
}
else {
    print "\n\tNo match: '$test_string' does not match '$target_word'.\n";
}
 

Result:

Comparing 's0opa d3wd 0ri \/\/|-|3&|z' against 'on wheels' taking into account homogylphs

ee -> (\Qee\E|\Qea\E)
	on wh(ee|ea)ls
aa -> (\Qaa\E|\Qee\E)
	on wh(ee|ea)ls
a -> (\Qa\E|\Q4\E|\Qci\E|\Qaye\E|\Qci\E|\Q/\\\E|\Q/-\\\E|\Q\@\E|\Q∂\E|\Qλ\E|\Q\&\E)
	on wh(ee|e(a|4|ci|aye|ci|\/\\|\/\-\\|\@|∂|λ|\&))ls
b -> (\Qb\E|\Q13\E|\Q!3\E|\Q8\E|\Qß\E|\Q13\E|\QI3\E|\Q|3\E|\QP>\E|\Q|:\E|\Q!3\E|\Q(3\E|\Q/3\E|\Q\\3\E|\Q)3\E|\Q[3\E|\Q]3\E)
	on wh(ee|e(a|4|ci|aye|ci|\/\\|\/\-\\|\@|∂|λ|\&))ls
c -> (\Qc\E|\Qk\E)
	on wh(ee|e(a|4|(c|k)i|aye|(c|k)i|\/\\|\/\-\\|\@|∂|λ|\&))ls
d -> (\Qd\E|\Qcl\E)
	on wh(ee|e(a|4|(c|k)i|aye|(c|k)i|\/\\|\/\-\\|\@|∂|λ|\&))ls
e -> (\Qe\E|\Q3\E)
	on wh((e|3)(e|3)|(e|3)(a|4|(c|k)i|ay(e|3)|(c|k)i|\/\\|\/\-\\|\@|∂|λ|\&))ls
f -> (\Qf\E|\Qph\E)
	on wh((e|3)(e|3)|(e|3)(a|4|(c|k)i|ay(e|3)|(c|k)i|\/\\|\/\-\\|\@|∂|λ|\&))ls
g -> (\Qg\E|\Qcj\E)
	on wh((e|3)(e|3)|(e|3)(a|4|(c|k)i|ay(e|3)|(c|k)i|\/\\|\/\-\\|\@|∂|λ|\&))ls
h -> (\Qh\E|\Q|-|\E)
	on w(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|k)i|ay(e|3)|(c|k)i|\/\\|\/\-\\|\@|∂|λ|\&))ls
i -> (\Qi\E|\Ql\E|\Q1\E|\Q!\E|\Q|\E)
	on w(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|k)(i|l|1|\!|\|)|ay(e|3)|(c|k)(i|l|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))ls
k -> (\Qk\E|\Qc\E)
	on w(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|l|1|\!|\|)|ay(e|3)|(c|(k|c))(i|l|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))ls
l -> (\Ql\E|\Qi\E|\Q1\E|\Q!\E|\Q|\E)
	on w(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|1|\!|\|)s
m -> (\Qm\E|\Qrn\E)
	on w(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|1|\!|\|)s
n -> (\Qn\E|\Qri\E)
	o(n|ri) w(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|1|\!|\|)s
o -> (\Qo\E|\Q0\E|\Q()\E|\Q(0)\E)
	(o|0|\(\)|\(0\))(n|ri) w(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|1|\!|\|)s
q -> (\Qq\E|\Q0\E)
	(o|0|\(\)|\(0\))(n|ri) w(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|1|\!|\|)s
s -> (\Qs\E|\Q\$\E|\Q5\E|\Qz\E)
	(o|0|\(\)|\(0\))(n|ri) w(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|1|\!|\|)(s|\$|5|z)
u -> (\Qu\E|\Q|_|\E)
	(o|0|\(\)|\(0\))(n|ri) w(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|1|\!|\|)(s|\$|5|z)
v -> (\Qv\E|\Q\\/\E)
	(o|0|\(\)|\(0\))(n|ri) w(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|1|\!|\|)(s|\$|5|z)
w -> (\Qw\E|\Qvv\E|\Q\\/\\/\E)
	(o|0|\(\)|\(0\))(n|ri) (w|vv|\\\/\\\/)(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|1|\!|\|)(s|\$|5|z)
x -> (\Qx\E|\Q\%\E)
	(o|0|\(\)|\(0\))(n|ri) (w|vv|\\\/\\\/)(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|1|\!|\|)(s|\$|5|z)
z -> (\Qz\E|\Q2\E)
	(o|0|\(\)|\(0\))(n|ri) (w|vv|\\\/\\\/)(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|1|\!|\|)|1|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|1|\!|\|)(s|\$|5|(z|2))
1 -> (\Q1\E|\Qi\E|\Ql\E|\Q!\E)
	(o|0|\(\)|\(0\))(n|ri) (w|vv|\\\/\\\/)(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|(1|i|l|\!)|\!|\|)|(1|i|l|\!)|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|(1|i|l|\!)|\!|\|)|(1|i|l|\!)|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|(1|i|l|\!)|\!|\|)(s|\$|5|(z|2))
2 -> (\Q2\E|\Qz\E)
	(o|0|\(\)|\(0\))(n|ri) (w|vv|\\\/\\\/)(h|\|\-\|)((e|3)(e|3)|(e|3)(a|4|(c|(k|c))(i|(l|i|(1|i|l|\!)|\!|\|)|(1|i|l|\!)|\!|\|)|ay(e|3)|(c|(k|c))(i|(l|i|(1|i|l|\!)|\!|\|)|(1|i|l|\!)|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|(1|i|l|\!)|\!|\|)(s|\$|5|(z|(2|z)))
3 -> (\Q3\E|\QE\E)
	(o|0|\(\)|\(0\))(n|ri) (w|vv|\\\/\\\/)(h|\|\-\|)((e|(3|E))(e|(3|E))|(e|(3|E))(a|4|(c|(k|c))(i|(l|i|(1|i|l|\!)|\!|\|)|(1|i|l|\!)|\!|\|)|ay(e|(3|E))|(c|(k|c))(i|(l|i|(1|i|l|\!)|\!|\|)|(1|i|l|\!)|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|(1|i|l|\!)|\!|\|)(s|\$|5|(z|(2|z)))
5 -> (\Q5\E|\Qs\E)
	(o|0|\(\)|\(0\))(n|ri) (w|vv|\\\/\\\/)(h|\|\-\|)((e|(3|E))(e|(3|E))|(e|(3|E))(a|4|(c|(k|c))(i|(l|i|(1|i|l|\!)|\!|\|)|(1|i|l|\!)|\!|\|)|ay(e|(3|E))|(c|(k|c))(i|(l|i|(1|i|l|\!)|\!|\|)|(1|i|l|\!)|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|(1|i|l|\!)|\!|\|)(s|\$|(5|s)|(z|(2|z)))
0 -> (\Q0\E|\Qo\E|\Q()\E|\Q(0)\E)
	(o|(0|o|\(\)|\(0\))|\(\)|\((0|o|\(\)|\(0\))\))(n|ri) (w|vv|\\\/\\\/)(h|\|\-\|)((e|(3|E))(e|(3|E))|(e|(3|E))(a|4|(c|(k|c))(i|(l|i|(1|i|l|\!)|\!|\|)|(1|i|l|\!)|\!|\|)|ay(e|(3|E))|(c|(k|c))(i|(l|i|(1|i|l|\!)|\!|\|)|(1|i|l|\!)|\!|\|)|\/\\|\/\-\\|\@|∂|λ|\&))(l|i|(1|i|l|\!)|\!|\|)(s|\$|(5|s)|(z|(2|z)))

	Match: '0ri \/\/|-|3&|z' matches 'on wheels'.