#!/usr/bin/perl

# Open one command file per speaker.  The handles are deliberately kept as
# package-level barewords: later code picks the handle by name, using the
# speaker column of the raw data as a symbolic filehandle.
open(BUSH,  '>', 'bush-cmd')  or die "Cannot write bush-cmd: $!";
open(KERRY, '>', 'kerry-cmd') or die "Cannot write kerry-cmd: $!";
open(MOD,   '>', 'mod-cmd')   or die "Cannot write mod-cmd: $!";

print BUSH "TITLE Words/phrases spoken by Bush during the 2004-09-30 Debate\n";
print KERRY "TITLE Words/phrases spoken by Kerry during the 2004-09-30 Debate\n";
print MOD "TITLE Words/phrases spoken by the moderator during the 2004-09-30 Debate\n";

# Slurp the raw data once; @lines is re-read by the later passes.
# Failing loudly here avoids silently generating empty charts.
open(my $raw, '<', 'rawdata-debate1.txt')
  or die "Cannot read rawdata-debate1.txt: $!";
@lines = (<$raw>);
close $raw;
chomp @lines;

# %hash maps speaker -> word/phrase -> count field.  Each line is
# "speaker<TAB>count<TAB>text".  The count field is stored verbatim;
# getcount() treats a non-numeric field as the name of another entry.
%hash = ();
foreach my $line (@lines) {
  my ($speaker, $count, $text) = split("\t", $line);
  $hash{$speaker}{$text} = $count;
}

# getcount(SPEAKER, KEY) - resolve the count recorded for KEY under SPEAKER.
#
# Looks KEY up in the global %hash (speaker -> key -> count field):
#   * a field consisting only of digits is returned as the count;
#   * a missing or empty field yields 0;
#   * any other field is taken as the name of another key of the same
#     speaker and is followed until a count is reached.
# A chain of names that loops back on itself yields 0 rather than recursing
# without end.
#
# All working variables are lexical, so calling this from inside a loop
# that uses $p / $data / $result no longer overwrites the caller's values.
sub getcount {
  my ($speaker, $key) = @_;
  my %seen;
  while (defined $key && !$seen{$key}++) {
    my $field = $hash{$speaker}{$key};
    return 0 if !defined $field || $field eq "";
    return $field if $field =~ /^\d+$/;
    $key = $field;    # follow the alias to the next entry
  }
  return 0;
}

# Per-speaker totals over single-word entries (entries whose text contains
# a space are phrases and are left out):
#   %totcount     - sum of the resolved counts
#   %varietycount - number of single-word entries
#   %lettercount  - sum of (word length * resolved count)
%totcount = ();
%lettercount = ();
%varietycount = ();
foreach my $line (@lines) {
  my ($speaker, undef, $word) = split("\t", $line);
  next if $word =~ / /;
  my $spoken = getcount($speaker, $word);
  $totcount{$speaker}     += $spoken;
  $varietycount{$speaker} += 1;
  # Weight by the length of the word itself; measuring the speaker label
  # here would make every "average length" equal to the label's length.
  $lettercount{$speaker}  += length($word) * $spoken;
}

my %common;   # phrases of three or more words with a count above one
my %all;      # every entry that received a bar in a speaker profile

# Write one bar per charted entry into the command file of its speaker.
# The speaker column doubles as the name of the (bareword) output handle.
for my $row (@lines) {
  my ($who, $n, $text) = split("\t", $row);

  # Only entries carrying a plain, positive count are charted here.
  next unless $n =~ /^\d+$/;
  next unless $n > 0;

  my @tokens = split(" ", $text);
  my $nwords = @tokens;

  # A multi-word phrase that occurred just once gets no bar.
  next if $n == 1 && $nwords != 1;

  # Single words are green, phrases of up to three words blue, longer red.
  my $color = $nwords == 1 ? "green"
            : $nwords <  4 ? "blue"
            :                "red";

  print $who "NEWBAR $text\n";
  print $who "NEWSEGMENT $color $n bar of length $n\n";
  if ($nwords == 1) {
    # Share of the speaker's total word count, in per mil, two decimals.
    my $permil = int($n / $totcount{$who} * 100000) / 100;
    print $who "BARTEXT $n ($permil<IMG SRC=\"permil.gif\" ALT=\"per mil\">)\n";
  }
  else {
    print $who "BARTEXT $n\n";
  }

  $common{$text} = 1 if $nwords >= 3 && $n > 1;
  $all{$text} = 1;
}

# Append a summary legend to each speaker profile, then finish the files.
# Each name is used both as the key into the per-speaker totals and as the
# name of the (bareword) output handle.
foreach my $who ("BUSH", "KERRY", "MOD") {
  my $total = $totcount{$who} || 0;
  # A speaker without any counted word would otherwise stop the script
  # with "Illegal division by zero"; report an average length of 0 instead.
  my $avg_len = $total ? $lettercount{$who} / $total : 0;
  my $vocab = $varietycount{$who} || 0;
  print $who "LEGENDTEXT (Total $total words spoken, average length $avg_len, from a vocabulary of size $vocab)\n";
}

# Buffered write errors only surface at close, so check it.
close BUSH  or die "Cannot finish bush-cmd: $!";
close KERRY or die "Cannot finish kerry-cmd: $!";
close MOD   or die "Cannot finish mod-cmd: $!";

print STDERR "Written standard profiles.\n";

# dofile(FNAME, FACTOR) - turn a word-group file into a three-way bar chart.
#
# Reads FNAME and writes "FNAME-cmd".  An input line without a tab is
# copied out as a TITLE line.  Every other line has the form
# "label<TAB>word/word/...": the counts of all listed words are summed per
# speaker via getcount() and written as one bar with a Kerry, a moderator
# and a Bush segment.  Segment lengths are the sums multiplied by FACTOR;
# the bar text shows the unscaled sums.
#
# Dies if either file cannot be opened or the output cannot be completed.
# Returns 1 on success.
sub dofile {
  my ($fname, $factor) = @_;

  open(my $in,  '<', $fname)       or die "Cannot read $fname: $!";
  open(my $out, '>', "$fname-cmd") or die "Cannot write $fname-cmd: $!";

  while (my $line = <$in>) {
    if ($line !~ /\t/) {
      # Title lines keep their own line ending.
      print $out "TITLE $line";
      next;
    }

    chomp $line;
    my ($label, $words) = split("\t", $line);

    my ($kcount, $mcount, $bcount) = (0, 0, 0);
    foreach my $word (split("/", $words)) {
      $kcount += getcount("KERRY", $word);
      $mcount += getcount("MOD",   $word);
      $bcount += getcount("BUSH",  $word);
    }

    my $kkcount = $factor * $kcount;
    my $mmcount = $factor * $mcount;
    my $bbcount = $factor * $bcount;

    print $out "NEWBAR $label\n";
    print $out "NEWSEGMENT blue $kkcount bar of length $kcount\n";
    print $out "NEWSEGMENT green $mmcount bar of length $mcount\n";
    print $out "NEWSEGMENT red $bbcount bar of length $bcount\n";
    print $out "BARTEXT $kcount/$mcount/$bcount\n";
  }

  print $out "LEGENDTEXT Blue:Kerry, Green:moderator, Red:Bush\n";
  close $in;
  close $out or die "Cannot finish $fname-cmd: $!";
  return 1;
}

# Build the three user-defined charts.  Each job is an input file name
# together with the factor dofile() applies to the segment lengths.
for my $job (["proper-nouns", 5], ["interesting", 5], ["english-common", 1]) {
  dofile(@$job);
}

print STDERR "Written user files.\n";

# Chart of the repeated phrases collected in %common: one bar per phrase
# with a Kerry, a moderator and a Bush segment.  Phrases whose three counts
# add up to less than 3 are left out.
open(my $common_out, '>', 'common-cmd') or die "Cannot write common-cmd: $!";

print $common_out "TITLE Phrases spoken more than once by at least one person\n";

foreach my $phrase (sort keys %common) {
  # A count that is numerically zero is printed as a plain 0.
  my ($kerry, $mod, $bush) =
    map { my $n = getcount($_, $phrase); $n == 0 ? 0 : $n }
      "KERRY", "MOD", "BUSH";

  next if ($kerry + $mod + $bush < 3);

  # Segments are drawn at five units per occurrence.
  my $kerry_len = $kerry * 5;
  my $mod_len   = $mod * 5;
  my $bush_len  = $bush * 5;

  print $common_out "NEWBAR $phrase\n";
  print $common_out "NEWSEGMENT blue $kerry_len bar of length $kerry\n";
  print $common_out "NEWSEGMENT green $mod_len bar of length $mod\n";
  print $common_out "NEWSEGMENT red $bush_len bar of length $bush\n";
  print $common_out "BARTEXT $kerry/$mod/$bush\n";
}
print $common_out "LEGENDTEXT Blue:Kerry, Green:moderator, Red:Bush\n";

# Buffered write errors only surface at close, so check it.
close $common_out or die "Cannot finish common-cmd: $!";

# Start the "everything" chart.  OUT stays a bareword handle because the
# loop further down the file prints to it and closes it.
open(OUT, '>', 'everything-cmd') or die "Cannot write everything-cmd: $!";

print OUT "TITLE Everything (sorted by approximate Bush/Kerry ratio)\n";

# Sort comparator, used as "sort totweight LIST"; compares $a and $b.
#
# Orders keys by ascending (Kerry count + 1) / (Bush count + 1), so entries
# Bush used relatively more often come first.  Ties are broken by ascending
# moderator count and finally by string comparison of the keys.  The +1 on
# both sides keeps the ratio defined when Bush's count is 0.
#
# An earlier primary key - total count across all three speakers,
# descending - was disabled in the original and is not computed here.
#
# All working values are lexical so the comparator does not overwrite the
# package variables used by the loops that call sort.
sub totweight {
  my ($kerry_a, $mod_a, $bush_a) = map { getcount($_, $a) } "KERRY", "MOD", "BUSH";
  my ($kerry_b, $mod_b, $bush_b) = map { getcount($_, $b) } "KERRY", "MOD", "BUSH";

  my $ratio_a = ($kerry_a + 1) / ($bush_a + 1);
  my $ratio_b = ($kerry_b + 1) / ($bush_b + 1);

  return ($ratio_a <=> $ratio_b)
      || ($mod_a <=> $mod_b)
      || ($a cmp $b);
}

# One bar per entry in %all, in totweight order, written to the already
# open OUT handle.  Entries that no speaker used at least three times are
# left out.
foreach my $entry (sort totweight keys %all) {
  # A count that is numerically zero is printed as a plain 0.
  my ($kerry, $mod, $bush) =
    map { my $n = getcount($_, $entry); $n == 0 ? 0 : $n }
      "KERRY", "MOD", "BUSH";

  next if ($kerry < 3 and $mod < 3 and $bush < 3);

  # Segments are drawn at five units per occurrence.
  my $kerry_len = $kerry * 5;
  my $mod_len   = $mod * 5;
  my $bush_len  = $bush * 5;

  print OUT "NEWBAR $entry\n";
  print OUT "NEWSEGMENT blue $kerry_len bar of length $kerry\n";
  print OUT "NEWSEGMENT green $mod_len bar of length $mod\n";
  print OUT "NEWSEGMENT red $bush_len bar of length $bush\n";
  print OUT "BARTEXT $kerry/$mod/$bush\n";
}
print OUT "LEGENDTEXT Blue:Kerry, Green:moderator, Red:Bush\n";

# Buffered write errors only surface at close, so check it.
close OUT or die "Cannot finish everything-cmd: $!";

