#!/usr/bin/perl -w
use strict;

#Sample from biographies.list

#-------------------------------------------------------------------------------
#NM: Bauer, Hank (II)
#
#RN: Henry Albert Bauer
#
#DB: 31 July 1922, East St. Louis, Illinois, USA

print "Working: ";
my $master={};
my $nm=undef;
my $marking=undef;
my $linenumber=1;
my $dotcount=100000;
open (BIOS,"biographies.list") || die "Can't find the biographies list!";
while (<BIOS>) {
    chomp $_;
    if (($linenumber/$dotcount)==int($linenumber/$dotcount)) {print ".";}
    $linenumber++;
    if ($_=~/^(NM|RN|DB): /){
        $marking=$1;
        my @data=split(/..: /,$_);
        if ($marking eq "NM") {
            $nm=$data[1];
            $master->{$nm}->{"count"}++;    
        }
        if ($marking eq "RN") {$master->{$nm}->{"rn"}=$data[1];}
        if ($marking eq "DB") {$master->{$nm}->{"db"}=$data[1];}
    }
}
foreach my $namekey (keys(%$master)) {
    my $purenamekey=$namekey;
    $purenamekey=~s/'//;
    $purenamekey=~s/"//;
    $purenamekey=~s/\(.*\)//;
    $purenamekey=~s/^\s*//;
    $purenamekey=~s/\s*$//;
    $master->{$namekey}->{"purekey"}=$purenamekey;
    my @nameparts1=split(",",$purenamekey);
    foreach (@nameparts1) {
        $_=~s/^\s*//;
        $_=~s/\s*$//;
    }
    if (scalar(@nameparts1) <=1) {
        $master->{$namekey}->{"discard"}=1;
    }
    else {$master->{$namekey}->{"discard"}=0;}
    my @nameparts2=split(" ",$nameparts1[0]);
    my $puresurname=pop(@nameparts2);
    if (exists $master->{$namekey}->{"rn"}) {
        my $rn=$master->{$namekey}->{"rn"};
        #Remove Brackets (with contents) and quotes
        $rn=~s/\(.*\)//g;
        $rn=~s/'//g;
        $rn=~s/"//g;
        #remove 'the' to end
        $rn=~s/(^| )the( |$).*//i;
        #remove all dots
        $rn=~s/\.//gi;
        #Remove Roman Numerials and Jr. Sr.
        $rn=~s/(^| )(I|II|III|IV|V|VI|VII|VIII|IX|X|XI|XII|XIII)( |$)//i;
        $rn=~s/(^| )(Jr|Sr)( |$)//i;
        #remove initials?
        $rn=~s/(^| )\w( |$)//gi;
        #If there is a comma take the last word on the left of it
        if ($rn=~/,/) {
            my @realnameparts1=split(",",$rn);
            my @realnameparts2=split(" ",$realnameparts1[0]);
            my $realsurname=pop(@realnameparts2);
            if (defined ($realsurname)) {
                $realsurname=~s/^\s*//;
                $realsurname=~s/\s*$//;
                $master->{$namekey}->{"realsurname"}=$realsurname;
            }
            else {
                $master->{$namekey}->{"realsurname"}=$rn;
            }
        }
        else {
            my @realnameparts=split(" ",$rn);
            my $realsurname=pop(@realnameparts);
            if (defined ($realsurname)) {
                $realsurname=~s/^\s*//;
                $realsurname=~s/\s*$//;
                $master->{$namekey}->{"realsurname"}=$realsurname;
            }
            else {
                $master->{$namekey}->{"realsurname"}=$puresurname;
            }
        }
        $master->{$namekey}->{"realsurname"}=~s/\s*$//;
        $master->{$namekey}->{"realsurname"}=~s/^\s*//;
    }
    else {
        $master->{$namekey}->{"realsurname"}=$puresurname;
    }
    if (exists $master->{$namekey}->{"db"}) {
        my @dateparts=split(",",$master->{$namekey}->{"db"});
        my $birthcountry=pop(@dateparts);
        if (defined $birthcountry) {
            if ($birthcountry=~/\d/) {
                $birthcountry=undef; #There was only a date
            }
            else {
                $birthcountry=~s/^\s*//;
                $birthcountry=~s/\s*$//;
                $master->{$namekey}->{"birthcountry"}=$birthcountry;
            }
        }
        else {
            $birthcountry=undef;
        }
    }
}

print "\nWait for it: ... (just another minute)";
foreach my $namekey (keys(%$master)) {
    #Do some last minute fixing to the realsurname
    $master->{$namekey}->{"realsurname"}=~s/\.//g;
    $master->{$namekey}->{"realsurname"}=~s/(^| )\w( |$)//ig;
    $master->{$namekey}->{"realsurname"}=~s/(^| )\d*( |$)//g;
    #Doublecheck that we have a single realname
    my $realname=$master->{$namekey}->{"realsurname"};
    if ($realname=~/ /){
        my @nameparts=split(/ /,$realname);
        $master->{$namekey}->{"realsurname"}=pop(@nameparts);
        $realname=$master->{$namekey}->{"realsurname"};
    }
    #Do some last minute fixing to the birthcountry
    my $bc=$master->{$namekey}->{"birthcountry"};
    if (defined $bc) {
        $bc=~s/\[/(/g;
        $bc=~s/\]/)/g;
        if ($bc=~/\((.*)\)/){
            my $update=$1;
            $update=~s/^now //i;
            $bc=$update;
        }
        $bc=~s/\)//g;
        $bc=~s/\(//g;
        $bc=~s/"//g;
        $bc=~s/'//g;
        if ($bc eq "West") {
            $bc="Germany"
        }
        $master->{$namekey}->{"birthcountry"}=$bc;
    }
}

#Dump the output
open (OUTPUT,">IMDB-BiosData.txt") || die "Can't open the outfile";
print OUTPUT "NameKey\tPureNameKey\tDiscard\tRealSurname\tBirthCountry\tRealNameString\tBirthString\n";
foreach my $namekey (keys(%$master)) {
    print OUTPUT $namekey."\t";
    print OUTPUT $master->{$namekey}->{"purekey"}."\t";
    print OUTPUT $master->{$namekey}->{"discard"}."\t";
    if (!exists $master->{$namekey}->{"realsurname"}) {$master->{$namekey}->{"realsurname"}="";}
    if (!exists $master->{$namekey}->{"birthcountry"}) {$master->{$namekey}->{"birthcountry"}="";}
    if (!exists $master->{$namekey}->{"rn"}) {$master->{$namekey}->{"rn"}="";}
    if (!exists $master->{$namekey}->{"db"}) {$master->{$namekey}->{"db"}="";}
    print OUTPUT $master->{$namekey}->{"realsurname"}."\t";
    print OUTPUT $master->{$namekey}->{"birthcountry"}."\t";
    print OUTPUT $master->{$namekey}->{"rn"}."\t";
    print OUTPUT $master->{$namekey}->{"db"}."\n";
}
close (OUTPUT);
print "\nDone!\n";
