#!/usr/bin/perl # # Generate a bunch of crazy text, and post it to a blog # # use Net::Blogger; use Getopt::Std; # From a list of words and their probabilities, select one, # weighted by probability. sub select_random { my %choices = %{$_[0]}; #Get a hash from a reference. This is goddamn ugly. my $hash_sum = 0; # Get the sums of all the probabilites of the possible next word. foreach $key (keys(%choices)) { $hash_sum += $choices{$key}; } my $random = (rand() * $hash_sum); # Do the actual selection of the next word foreach $key (keys(%choices)) { $random -= $choices{$key}; if($random <= 0) { return $key; } } #Execution should not reach this point. return "SOMETHING HAS GONE HORRIBLY WRONG IN SELECT_RANDOM"; } # Given a path, load it into a hash of hashes of hashes. This is for trigrams. # Using recursion, the algorithm could be generalized to N-grams, but it is # unlikely that this will give any great gain in output quality. The structure # of the hash of hashes of hashes is such that the third rank of hashes # contains the probability that the third word (third rank hash key) will # follow the first two (The first and second rank keys). sub load_corpus { my $filepath = shift; open(CORPUS_FILE, "<", $filepath); my @input = ; my $corpus; foreach $line (@input) { $corpus .= $line; } # strip quotes, parens, and other double-ended punctuation $corpus =~ s/[\*\<\>\(\)\[\]\"\']//g; # strip double-dashes, replace with single dash $corpus =~ s/\-\-/\-/g; # remove elipsis $corpus =~ s/\.\.\./ /g; # pad periods, semicolons, etc with space $corpus =~ s/[!\.\;\:\-\,\?]/ $& /g; # replace all whitespace with one space $corpus =~ s/\s+/ /g; # lowercase everything $corpus = lc $corpus; # split it into an array my @corpus = split(' ', $corpus); my $index = 0; my $end = @corpus; $end -= 3; #This ends up being a hash of hashes of hashes. %words; while($index < $end) { $first = @corpus[$index]; $second = @corpus[$index+1]; $third = @corpus[$index+2]; $words{ $first => { $second => { $third => $words{$first}{$second}{$third}++ } } }; $index++; } return %words; } #Post to blogger sub post{ my $uname="YOUR NAME"; my $password="YOUR PASSWORD"; my $b = Net::Blogger->new(appkey=>'1234567890ABCDEF'); $b->Username($uname); $b->Password($password); $b->BlogId($b->GetBlogId(blogname=>'YOUR BLOG NAME')); my $id = $b->newPost(postbody=>\$output); print "Posted as $id"; } sub usage { print "usage: $0 [-vp] -f file\n"; print "-v Verbose mode.\n"; print "-p Post to blog. By default, does not post.\n"; print "-f filename File to use as input to the text generation algorithm.\n"; print "-l integer Maximum number of words to generate. Punctuation counts as words.\n"; exit(0); } #Seed the RNG, before any calls to rand srand(time()); # Get the options and parse them. If no options are given, display usage. my %options=(); getopts("vpf:l:", \%options) or usage(); # Set the file path to read from and read it # my $corpus_path = "/home/ams/nutter/cutting_edge.txt"; if(defined($options{f})){ %data = load_corpus($options{f}); } else{ print("You must specify an input file\n"); usage(); } #set the word count my $wordcnt=300; if(defined($options{l})){ $wordcnt=$options{l}; } my @seed=("in", "the"); #%data = load_corpus($corpus_path); my $output; #Add the first words to the output $output = $output . join(" ", @seed); for($ii=0; $ii < $wordcnt; $ii++) { %choices = %{$words{$seed[0]}{$seed[1]}}; $next = select_random(\%choices); shift(@seed); $seed[1] = $next; $output = $output . " " . $next; } #Tighten spacing around commas, terminal punctuation, and dashes $output =~ s/ ,/,/g; $output =~ s/ \./\./g; $output =~ s/ !/!/g; $output =~ s/ \?/\?/g; $output =~ s/ - /-/g; #Uppercase letters after terminal punctuation $output =~ s/[\.?!] ([a-z])/". " . uc($1)/eg; # End at end of sentence rather than just cutting off. $output =~ s/[^\.]*$//g; # Capitalize the first letter $output =~ s/^(.)/uc($1)/eg; print $output; #Send it to blogger if desired post() if defined $options{p};