I’m very happy to know that Perl has global appeal from seeing all the non-English Perl blogs aggregated on Planet Iron Man, but since I’m a (typical American) monoglot, I’d prefer an Iron Man feed with only English articles. So I made one.
It’s available at http://feeds.dagolden.com/ironman-english.xml. It updates hourly from the master feed.
And for the curious, or for anyone who wants to adapt this for other languages, here’s the Perl program that I whipped up to create the feed:
Update: I’ve also put the code up on GitHub: ironman-feedfilter
# feedfilter.pl - downloads and filters the Perl Ironman feed for English
# entries. Results sent to STDOUT.
#
# The heuristic filters out entries unless the content is mostly latin
# characters and English is close to the best guess of a language. Short
# entries with code seem to confuse Lingua::Identify, so we take entries that
# seem "close-enough". Tuned via trial-and-error.
#
# Copyright (c) 2010 by David Golden - This may be used or copied under the
# same terms as Perl itself.
use 5.008001;
use strict;
use warnings;
use utf8;
use autodie;
use IO::File;
use Lingua::Identify qw(:language_identification);
use Time::Piece;
use URI;
use XML::Atom::Feed;
# XML::Atom package-level settings: turn on its ForceUnicode flag and make
# "1.0" the default Atom version for the feed objects created in run().
$XML::Atom::ForceUnicode = 1;
$XML::Atom::DefaultVersion = "1.0";
# Global heuristic tuning. Both values are file-scoped lexicals read by run()
# when it decides whether to keep an entry, so they must be assigned before
# run() is called below.
my $latin_target = 0.95; # keep an entry only if latin_ratio() exceeds this (95% Latin chars)
my $lang_fuzz = 0.02; # keep only if (best language probability - English probability) is below this
# Script entry point: everything else happens inside run().
run();
#--------------------------------------------------------------------------#
# latin_ratio($string)
#
# Returns the number of characters in $string matching \p{Latin} divided by
# the number matching \p{Alphabetic}. Returns 0 when either count is zero,
# which also avoids dividing by zero for text with no alphabetic characters.
sub latin_ratio {
    my ($text) = @_;

    # Count matches one at a time; each scalar-context //g match resumes
    # where the previous one stopped and the position resets once it fails.
    my $alpha_count = 0;
    $alpha_count++ while $text =~ /\p{Alphabetic}/g;

    my $latin_count = 0;
    $latin_count++ while $text =~ /\p{Latin}/g;

    # The $alpha_count check is probably redundant but guards the division.
    return 0 unless $latin_count && $alpha_count;
    return $latin_count / $alpha_count;
}
# run()
#
# Downloads the Planet Iron Man feed, copies its subtitle and links into a
# new feed, adds every entry that passes the English heuristic, and prints
# the resulting feed as XML to STDOUT.
#
# An entry is kept when latin_ratio() of its content body exceeds
# $latin_target AND the probability langof() assigns to English ("en") is
# within $lang_fuzz of the most probable language. Entries that have no
# content body cannot be classified and are skipped.
#
# Takes no arguments. Dies with a descriptive message if the source feed
# cannot be loaded.
sub run {
    my $source  = URI->new("http://ironman.enlightenedperl.org");
    my $in_feed = XML::Atom::Feed->new($source);

    # Fail here with a useful message instead of crashing later with
    # "Can't call method ... on an undefined value".
    if ( !defined $in_feed ) {
        my $reason = XML::Atom::Feed->errstr || 'unknown error';
        die "Can't load feed from $source: $reason\n";
    }

    my $out_feed = XML::Atom::Feed->new;
    $out_feed->title("Planet Iron Man: English Edition");
    $out_feed->subtitle( $in_feed->subtitle );
    # NOTE(review): the year in this tag URI is recomputed on every run, so
    # the feed id changes when the year rolls over. Atom ids are meant to be
    # permanent -- consider pinning the date. Left unchanged here.
    $out_feed->id("tag:feeds.dagolden.com,".gmtime->year().":ironman:english");
    $out_feed->generator("XML::Atom/" . XML::Atom->VERSION);
    $out_feed->updated( gmtime->datetime . "Z" );

    # NOTE(review): link() called with an argument may replace rather than
    # append in XML::Atom -- confirm whether add_link() was intended.
    for my $link ( $in_feed->link ) {
        $out_feed->link($link);
    }

    ENTRY:
    for my $entry ( $in_feed->entries ) {
        # Some entries may carry no <content> element; calling ->body on
        # undef would abort the whole run, so skip those entries instead.
        my $content = $entry->content;
        my $body    = defined $content ? $content->body : undef;
        next ENTRY unless defined $body && length $body;

        # Check the Latin ratio first so langof() is only called for
        # entries that can still be accepted.
        next ENTRY unless latin_ratio($body) > $latin_target;

        # langof() in list context yields language => probability pairs.
        # It may yield nothing, in which case both values below stay 0.
        my %prob_of   = langof($body);
        my $best_prob = 0;
        for my $prob ( values %prob_of ) {
            $best_prob = $prob if $prob > $best_prob;
        }
        my $en_prob = $prob_of{en} || 0;

        $out_feed->add_entry($entry)
            if $best_prob - $en_prob < $lang_fuzz;
    }

    binmode(STDOUT, ":utf8");
    print $out_feed->as_xml;
}