package Tantrix::SLDfind;

# Builds $Tantrix::SLDfind::domain_find_re, a case-insensitive regex that
# locates registrable ("second level") domain names embedded in free text.
#
# The pattern is assembled from small string fragments.  All fragments are
# single-quoted so that characters such as '{', '$' and '@' can never be
# mistaken for variable interpolation while the pieces are being built;
# interpolation happens exactly once, inside the final qr//.

use warnings;
use strict;

# Country code TLDs.  http://www.iana.org/cctld/cctld-whois.htm
# Any two letters is accepted rather than the real list -- close enough.
my $cctld = '[a-z][a-z]';

# Three letter generic TLDs.  http://www.iana.org/gtld/gtld.htm
# Spam is unlikely from TLDs reserved for governments or treaties between
# governments, so those are deliberately left out.  The full list would be:
#   (?:biz|com|net|org|pro|gov|edu|mil|int)
my $gtld = '(?:biz|com|net|org)';

# Second level labels under a TLD.  An explicit list such as (?:co|ne|...)
# would be tighter, but accepting any two or three letters also catches
# oddities like 'uk.com'.
my $sld2 = '[a-z][a-z]';
my $sld3 = '[a-z][a-z][a-z]';

# Four letter generic TLDs.  http://www.iana.org/gtld/gtld.htm
# Until spam appears on any besides .info, ignore the others.  The full
# list would be: (?:info|aero|coop|name)
my $gtld4 = 'info';

# Any character that can appear in a domain label ...
my $dc = '[-a-z0-9]';

# ... and four or more of them.  Concatenation is used instead of
# "$dc\{4,}" so the quantifier brace cannot be parsed as a hash subscript.
my $dc4p = $dc . '{4,}';

# The match must not be followed by anything that could continue a domain
# name (label characters or '.') or turn it into part of an address ('@').
my $end = '(?![-a-z0-9.@])';

# Observe that spamvertised domains are unlikely to be valuable three
# letter domains.  (Even four, frankly, is unlikely.)  Hence the label in
# front of a two/three letter TLD must be at least four characters long;
# only .info accepts a label of any length.
#
# The order of the alternatives matters: the longer suffix forms are tried
# before the bare TLD forms.
our $domain_find_re = qr/
    (?:
        (?: $dc4p \.
            (?: $sld3 \. $cctld     # com.mx
              | $sld2 \. $gtld      # uk.com
              | $sld2 \. $cctld     # co.nz
              | $gtld               # net
              | $cctld              # de
            )
        )
      | (?: $dc+ \. $gtld4 )        # info
    )
    $end
/xi;

1;