Difference between revisions of "PhD Masterclass - How to Build a Web Crawler"

From edegan.com
Jump to navigation Jump to search
imported>Ed
(Created page with "This page provides resources for the PhD Masterclass "How to Build a Web Crawler", which I gave on Friday 28th January 2011 to interested PhD students at Haas. ==Tools== *[h...")
 
(3 intermediate revisions by one other user not shown)
Line 26: Line 26:
 
  perl Script1.pl
 
  perl Script1.pl
  
Or we can run it in command by going:
+
Or we can run it in Komodo by going:
 
  Debug->Go
 
  Debug->Go
  
Line 34: Line 34:
 
  Use PuTTY to connect to bear.haas.berkeley.edu (see [[Research Computing At Haas|here]]).
 
  Use PuTTY to connect to bear.haas.berkeley.edu (see [[Research Computing At Haas|here]]).
 
  perl Script1.pl
 
  perl Script1.pl
 
  
 
===Processing Text Data===
 
===Processing Text Data===
Line 69: Line 68:
 
  #!/usr/bin/perl -w
 
  #!/usr/bin/perl -w
 
  #Lines that start with a # are comments that aren't read by the interpreter
 
  #Lines that start with a # are comments that aren't read by the interpreter
 
+
 
  use strict;
 
  use strict;
 
  #The strict module forces us to declare variables before we use them
 
  #The strict module forces us to declare variables before we use them
 
+
 
  my @Textfile;
 
  my @Textfile;
 
  #Declare an array called TextFile
 
  #Declare an array called TextFile
 
+
 
  open (DATA,"Data.txt");
 
  open (DATA,"Data.txt");
 
  #Open a filehandle on our file
 
  #Open a filehandle on our file
 
+
 
  while (<DATA>) {
 
  while (<DATA>) {
 
  #Read the data from the filehandle, line by line
 
  #Read the data from the filehandle, line by line
 
+
 
     chomp $_;
 
     chomp $_;
 
     #$_ is a special variable - it captures the line being read from the filehandle here
 
     #$_ is a special variable - it captures the line being read from the filehandle here
 
+
 
     if (!$_) {next;}
 
     if (!$_) {next;}
 
     #if the line is undefined (i.e. blank) move to the next loop iteration
 
     #if the line is undefined (i.e. blank) move to the next loop iteration
 
+
 
     my $line = $_;  
 
     my $line = $_;  
 
     #Set a local variable called line to $_
 
     #Set a local variable called line to $_
 
+
 
     push (@Textfile, $line);
 
     push (@Textfile, $line);
 
     #Push the line onto the Textfile array
 
     #Push the line onto the Textfile array
 
  }
 
  }
 
+
 
  my $Doccell;
 
  my $Doccell;
 
  #Declare the Doccell variable
 
  #Declare the Doccell variable
 
+
 
  for (my $i=0; $i<=$#Textfile; $i++) {
 
  for (my $i=0; $i<=$#Textfile; $i++) {
 
  #Do a for loop, starting from i=0, going while i is less than the  
 
  #Do a for loop, starting from i=0, going while i is less than the  
 
  #last index of the Textfile array, and incrementing by one each time
 
  #last index of the Textfile array, and incrementing by one each time
 
+
 
     if ($Textfile[$i]=~/^Document\(s\):/) {$Doccell=$i;}
 
     if ($Textfile[$i]=~/^Document\(s\):/) {$Doccell=$i;}
 
     #Test to see if the entry matches a regular expression, if it does record the index
 
     #Test to see if the entry matches a regular expression, if it does record the index
 
  }
 
  }
 
+
 
  my @docs = splice(@Textfile,$Doccell);
 
  my @docs = splice(@Textfile,$Doccell);
 
  #Create a next array by splicing out everything after the index we just found
 
  #Create a next array by splicing out everything after the index we just found
 
+
 
  shift @docs;
 
  shift @docs;
 
  #Remove the first element of the docs array
 
  #Remove the first element of the docs array
 
+
 
  my $Firm = shift @Textfile;
 
  my $Firm = shift @Textfile;
 
  #Set Firm equal to the first element of Textfile (which we just removed)
 
  #Set Firm equal to the first element of Textfile (which we just removed)
 
+
 
  my $Violation =shift(@Textfile);
 
  my $Violation =shift(@Textfile);
 
  #Set Violation equal to the (new) first element of Textfile (which we just removed)
 
  #Set Violation equal to the (new) first element of Textfile (which we just removed)
 
+
 
  my $Offense={};
 
  my $Offense={};
 
  #Create an anonymous hash
 
  #Create an anonymous hash
 
+
 
  foreach my $cell (@Textfile) {\
 
  foreach my $cell (@Textfile) {\
 
  #Iterative over Textfile, setting the current iteration to cell
 
  #Iterative over Textfile, setting the current iteration to cell
 
+
 
     my ($name,@value)=split(":",$cell);
 
     my ($name,@value)=split(":",$cell);
 
     #Spill the cell on :
 
     #Spill the cell on :
 
+
 
     my $value=join(":",@value);
 
     my $value=join(":",@value);
 
     #Join the Value array on :
 
     #Join the Value array on :
 
+
 
     $Offense->{$name}=$value;
 
     $Offense->{$name}=$value;
 
     #Set an entry in the Offense hash
 
     #Set an entry in the Offense hash
 
  }
 
  }
 
+
 
  $Offense->{"DocList"}=\@docs;
 
  $Offense->{"DocList"}=\@docs;
 
  #Set the doclist entry in the Offense hash to a reference to the docs array
 
  #Set the doclist entry in the Offense hash to a reference to the docs array
Line 139: Line 138:
 
  my $Master=[];
 
  my $Master=[];
 
  #Define an anonymous array
 
  #Define an anonymous array
 
+
 
  $Master->[0]={};
 
  $Master->[0]={};
 
  #Define an anonymous hash in the zeroth cell of the anonymous array
 
  #Define an anonymous hash in the zeroth cell of the anonymous array
 
+
 
  $Master->[0]->{FirmName}=$Firm;
 
  $Master->[0]->{FirmName}=$Firm;
 
  #Set a hash entry
 
  #Set a hash entry
 
+
 
  $Master->[0]->{Offense}=$Offense;
 
  $Master->[0]->{Offense}=$Offense;
 
  #Set a hash entry
 
  #Set a hash entry
 
+
 
  $Master->[0]->{Violation}=$Violation;
 
  $Master->[0]->{Violation}=$Violation;
 
  #Set a hash entry
 
  #Set a hash entry
Line 154: Line 153:
 
  open(OUTPUT,">Result.txt");
 
  open(OUTPUT,">Result.txt");
 
  #Open a filehandle for writing (overwrite the file if it exists)
 
  #Open a filehandle for writing (overwrite the file if it exists)
 
+
 
  print OUTPUT $Master->[0]->{FirmName};
 
  print OUTPUT $Master->[0]->{FirmName};
 
  #Print the output file an entry from the anonymous hash in the anonymous array
 
  #Print the output file an entry from the anonymous hash in the anonymous array
 
+
 
  print OUTPUT "\t";
 
  print OUTPUT "\t";
 
  #Print a tab
 
  #Print a tab
 
+
 
  print OUTPUT $Master->[0]->{Violation}."\t";
 
  print OUTPUT $Master->[0]->{Violation}."\t";
 
  #Print another entry with another tab on the end
 
  #Print another entry with another tab on the end
 
+
 
  foreach my $key ( sort {$a cmp $b } (keys %{ $Master->[0]->{Offense} } )) {
 
  foreach my $key ( sort {$a cmp $b } (keys %{ $Master->[0]->{Offense} } )) {
 
  #Iterate through the hash's keys, in alphabetical order, setting the current key to $key
 
  #Iterate through the hash's keys, in alphabetical order, setting the current key to $key
 
+
 
     print OUTPUT  $Master->[0]->{Offense}->{$key}."\t";
 
     print OUTPUT  $Master->[0]->{Offense}->{$key}."\t";
 
     #Print an entry, with a tab
 
     #Print an entry, with a tab
 
  }
 
  }
 
+
 
  print OUTPUT "\n";
 
  print OUTPUT "\n";
 
  #Print a new line
 
  #Print a new line
 
+
 
  close OUTPUT;
 
  close OUTPUT;
 
  #Close the output filehandle - this will flush the write buffer
 
  #Close the output filehandle - this will flush the write buffer
 
+
 
==Modules==
 
==Modules==
  
Line 195: Line 194:
 
  use LWP::UserAgent;
 
  use LWP::UserAgent;
 
  #Use the LWP::UserAgent modules
 
  #Use the LWP::UserAgent modules
 
+
 
  my $ua = LWP::UserAgent->new;
 
  my $ua = LWP::UserAgent->new;
 
  #Create a new UserAgent
 
  #Create a new UserAgent
 
+
 
  my $url="http://www.contractormisconduct.org/index.cfm/1,73,222,html?CaseID=2";
 
  my $url="http://www.contractormisconduct.org/index.cfm/1,73,222,html?CaseID=2";
 
  #Set up a string containing a URL
 
  #Set up a string containing a URL
 
+
 
  my $response = $ua->get($url);
 
  my $response = $ua->get($url);
 
  #Use the UA 'get' method to retrieve the webpage. This returns an HTTP Response object
 
  #Use the UA 'get' method to retrieve the webpage. This returns an HTTP Response object
 
+
 
  my $content=$response->decoded_content;
 
  my $content=$response->decoded_content;
 
  #Get the response as one long text string, so we can work with it...
 
  #Get the response as one long text string, so we can work with it...
Line 212: Line 211:
 
  use HTML::TreeBuilder;
 
  use HTML::TreeBuilder;
 
  #Use the HTML::TreeBuilder modules
 
  #Use the HTML::TreeBuilder modules
 
+
 
  my $tree = HTML::TreeBuilder->new; # empty tree
 
  my $tree = HTML::TreeBuilder->new; # empty tree
 
  #Create a new tree object
 
  #Create a new tree object
 
+
 
  $tree->parse($content);
 
  $tree->parse($content);
 
  #Load up the tree from the content string (that we got using UA)
 
  #Load up the tree from the content string (that we got using UA)
 
+
 
  my $dump=$tree->as_text;
 
  my $dump=$tree->as_text;
 
  #Dump the tree as text maybe
 
  #Dump the tree as text maybe
 
+
 
  my $incidentelement=$tree->look_down("id","primecontent");
 
  my $incidentelement=$tree->look_down("id","primecontent");
 
  #Or use HTML::Element methods to look_down the tree for a tag with some properties
 
  #Or use HTML::Element methods to look_down the tree for a tag with some properties
 +
 +
==An Example Webcrawler==
 +
 +
I wrote the following simple webcrawler for a fellow PhD student:
 +
 +
#!/usr/bin/perl -w
 +
use strict;
 +
 +
use LWP::UserAgent;
 +
#Use the LWP::UserAgent modules
 +
use HTML::TreeBuilder;
 +
#Use the HTML::TreeBuilder modules
 +
 +
my $ua = LWP::UserAgent->new;
 +
#Create a new UserAgent
 +
 +
my @Pkids;
 +
open (PKIDS,"Pkidfile.txt") || die "Can't open the PKID file to read $!";
 +
#Open the Pkid file to read - this file has a Pkid on each line. You can get some from here: http://myaccount.sdar.com/RealtorSrch.asp
 +
 +
while (<PKIDS>) {
 +
    #Read the pkid file line by line
 +
   
 +
    chomp $_;
 +
    #Remove the \n (newline symbol) from each line
 +
   
 +
    push(@Pkids,$_);
 +
    #Add the PKID to an array
 +
}
 +
 +
open (RESULTS,">Results.txt") || die "Can't write the Results.txt file $!";
 +
#Open the Results file to write
 +
 +
my $headerflag=0;
 +
#Set a flag to indicate whether we wrote the header line to the output file
 +
 +
foreach my $Pkid (sort (@Pkids)) {
 +
    #Go through the PKIDs in order
 +
   
 +
    my $url="http://myaccount.sdar.com/RealtorSrchDetail.asp?PKID=".$Pkid;
 +
    #Set up a string containing a URL
 +
 +
    my $response = $ua->get($url);
 +
    #Use the UA 'get' method to retrieve the webpage. This returns an HTTP Response object
 +
 +
    my $content=$response->decoded_content;
 +
    #Get the response as one long text string, so we can work with it...
 +
 +
    my $tree = HTML::TreeBuilder->new; # empty tree
 +
    #Create a new tree object
 +
 +
    $tree->parse($content);
 +
    #Load up the tree from the content string (that we got using UA)
 +
 +
    my $name=$tree->look_down("width","520");
 +
    #Find an element in the HTML that has width=520 (this is where names are stored)
 +
   
 +
    my $nametext=$name->as_text;
 +
    #Convert it to text
 +
   
 +
    $nametext=~s/^\s{1,}//;
 +
    #Remove leading spaces
 +
   
 +
    $nametext=~s/\s{1,}$//;
 +
    #Remove trailing spaces
 +
   
 +
    $nametext=~s/^\s{2,}/ /g;
 +
    #Replace double spaces with a single space, globally
 +
   
 +
    my @fieldstext;
 +
    #Declare an array
 +
   
 +
    my @fields=$tree->look_down("class","field_labels");
 +
    #Find all of the field elements
 +
   
 +
    foreach my $field (@fields) {
 +
    #Go through them
 +
   
 +
        my $fieldparent=$field->parent;
 +
        #Go to their parent
 +
       
 +
        my $fieldparenttext=$fieldparent->as_text;
 +
        #Turn the parent into text
 +
       
 +
        $fieldparenttext=~s/^\s{1,}//; $fieldparenttext=~s/\s{1,}$//; $fieldparenttext=~s/^\s{2,}/ /g;
 +
        #Deal with spaces again
 +
       
 +
        push @fieldstext,$fieldparenttext;
 +
        #Add the fields to a list
 +
    }
 +
   
 +
    &writeoutput($Pkid,$nametext,@fieldstext);
 +
    #Call the write output subroutine
 +
   
 +
    $content=undef;  $tree=undef;  $name=undef;  undef @fields;
 +
    #Set a bunch of variables to undefined - this frees up memory
 +
   
 +
    sleep(2);
 +
    #Pause for a second or two...
 +
}
 +
 +
close (RESULTS);
 +
#Close the Results filehandle - this flushes the write buffer
 +
 +
sub writeoutput {
 +
#Declare the writeoutput subroutine
 +
 +
    my $data={};
 +
    #Set up an anonymous hash
 +
   
 +
    $data->{"A Pkid"}=shift @_;
 +
    #Set the A PKID field to the first parameter passed to the subroutine
 +
   
 +
    $data->{"A Name"}=shift @_;
 +
    #Set the A PKID field to the second parameter passed to the subroutine (the first has now gone)
 +
   
 +
    push(my @fields,@_);
 +
    #Add the remaining parameters to an array
 +
   
 +
    foreach my $field (@fields) {
 +
    #Go through the array
 +
   
 +
        my @fieldparts=split(":",$field);
 +
        #Split the fields on semicolon
 +
       
 +
        my $key=shift(@fieldparts);
 +
        #Set the key
 +
       
 +
        $data->{$key}=join(":",@fieldparts);
 +
        #Write the hash entry
 +
    }
 +
    if (!$headerflag) {
 +
        #If the headflag is 0 then do this
 +
       
 +
        foreach my $key (sort {$a cmp $b} (keys %{$data})) {
 +
        #Go through the keys
 +
       
 +
            print RESULTS $key."\t";
 +
            #Write the key followed by a tab
 +
        }
 +
        print RESULTS "\n";
 +
        #Print a newline
 +
       
 +
        $headerflag=1;
 +
        #Set the headflag to 1
 +
    }
 +
   
 +
    foreach my $key (sort {$a cmp $b} (keys %{$data})) {
 +
        #Go through the keys again
 +
       
 +
        print RESULTS $data->{$key}."\t";
 +
        #This time print the data followed by tabs
 +
    }
 +
    print RESULTS "\n";
 +
    #print a newline
 +
}
 +
 +
print "Thanks to Ed";
 +
#Thank Ed.
 +
 +
[[category:McNair Admin]]
 +
[[admin_classification::Software Tutorial| ]]

Latest revision as of 13:00, 18 July 2016

This page provides resources for the PhD Masterclass "How to Build a Web Crawler", which I gave on Friday 28th January 2011 to interested PhD students at Haas.

Tools

  • Perl - Available with a large set of useful modules for Windows from ActiveState as ActivePerl
  • Komodo - An integrated development environment for Perl available from ActiveState
  • Textpad - A powerful shareware text editor that supports regular expressions

You should download a trial of Komodo to help you learn. The trial is valid for 21 days (longer if you keep changing your system clock). Komodo will let you step through your code, line by line, and see the values that your variables take on.

Perl is a free and open language, with a rich history, so you will find a wealth of information on the web to help you learn and use it.

Sample Perl Code

We wrote a couple of simple scripts together to get to grips with Perl.

Running a Perl Script

The first was (save it in a file called Script1.pl in the root of your R drive):

print "Hello World";

To execute the script we can either open a command prompt and run the script:

Start->Run->"cmd.exe"
R:
perl Script1.pl

Or we can run it in Komodo by going:

Debug->Go

(Under Preferences->Debugger tick the box to avoid being prompted by the debug dialog each time)

Or we can shell on to Bear and run it there:

Use PuTTY to connect to bear.haas.berkeley.edu (see here).
perl Script1.pl

Processing Text Data

Next we went to:

http://www.contractormisconduct.org/index.cfm/1,73,222,html?CaseID=2

And we created a file called Data.txt (saved next to the script) that contained the following:

Accenture
Potential Foreign Corrupt Practices Act Violation
Date:  07/01/2003 (Date of Incident Report)

Misconduct Type:  Ethics

Enforcement Agency:  SEC

Contracting Party:  None

Court Type:  Administrative

Amount:  $0

Disposition:  Pending

Synopsis:  "As previously reported in July 2003, we became aware of an incident..."

Document(s):
•1.  SEC 10-K (p. 34 of 137)

We then wrote the following script to process the data:

#!/usr/bin/perl -w
#Lines that start with a # are comments that aren't read by the interpreter

use strict;
#The strict module forces us to declare variables before we use them

my @Textfile;
#Declare an array called Textfile

open (my $datafh, "<", "Data.txt") or die "Can't open Data.txt to read $!";
#Open a filehandle on our file for reading, and stop with an error if that fails.
#We keep the handle in a variable rather than calling it DATA, because Perl
#reserves the name DATA for the __DATA__ section of a script.

while (<$datafh>) {
#Read the data from the filehandle, line by line

    chomp $_;
    #$_ is a special variable - it captures the line being read from the filehandle here

    if ($_ !~ /\S/) {next;}
    #if the line is blank (empty, or nothing but whitespace) move to the next loop iteration

    my $line = $_;
    #Set a local variable called line to $_

    push (@Textfile, $line);
    #Push the line onto the Textfile array
}

close ($datafh);
#We have read everything, so close the input filehandle

my $Doccell;
#Declare the Doccell variable

for (my $i=0; $i<=$#Textfile; $i++) {
#Do a for loop, starting from i=0, going while i is less than or equal to the
#last index of the Textfile array, and incrementing by one each time

    if ($Textfile[$i]=~/^Document\(s\):/) {$Doccell=$i;}
    #Test to see if the entry matches a regular expression, if it does record the index
}

my @docs;
if (defined $Doccell) {@docs = splice(@Textfile,$Doccell);}
#Create a new array by splicing out everything from the index we just found onwards.
#If there was no "Document(s):" line then Doccell is undefined and docs stays empty
#(splicing with an undefined index would remove every line from Textfile).

shift @docs;
#Remove the first element of the docs array (the "Document(s):" line itself)

my $Firm = shift @Textfile;
#Set Firm equal to the first element of Textfile (which we just removed)

my $Violation =shift(@Textfile);
#Set Violation equal to the (new) first element of Textfile (which we just removed)

my $Offense={};
#Create an anonymous hash

foreach my $cell (@Textfile) {
#Iterate over Textfile, setting the current element to cell

    my ($name,@value)=split(":",$cell);
    #Split the cell on :

    my $value=join(":",@value);
    #Join the Value array on : (this puts back any colons that were part of the value)

    $Offense->{$name}=$value;
    #Set an entry in the Offense hash
}

$Offense->{"DocList"}=\@docs;
#Set the doclist entry in the Offense hash to a reference to the docs array

my $Master=[];
#Define an anonymous array

$Master->[0]={};
#Define an anonymous hash in the zeroth cell of the anonymous array

$Master->[0]->{FirmName}=$Firm;
#Set a hash entry

$Master->[0]->{Offense}=$Offense;
#Set a hash entry

$Master->[0]->{Violation}=$Violation;
#Set a hash entry

open(my $outfh, ">", "Result.txt") or die "Can't write the Result.txt file $!";
#Open a filehandle for writing (overwrite the file if it exists)

print $outfh $Master->[0]->{FirmName};
#Print to the output file an entry from the anonymous hash in the anonymous array

print $outfh "\t";
#Print a tab

print $outfh $Master->[0]->{Violation}."\t";
#Print another entry with another tab on the end

foreach my $key ( sort {$a cmp $b } (keys %{ $Master->[0]->{Offense} } )) {
#Iterate through the hash's keys, in alphabetical order, setting the current key to $key

    my $entry = $Master->[0]->{Offense}->{$key};
    #Look up the entry for this key

    if (ref($entry) eq "ARRAY") {$entry = join("; ", @{$entry});}
    #The DocList entry is a reference to an array - printing a reference gives
    #something like ARRAY(0x1234), so join the array's elements into one string instead

    print $outfh $entry."\t";
    #Print an entry, with a tab
}

print $outfh "\n";
#Print a new line

close ($outfh) or die "Can't finish writing the Result.txt file $!";
#Close the output filehandle - this will flush the write buffer, so check that it worked

Modules

One of the joys of Perl is CPAN - The Comprehensive Perl Archive Network, which acts as a repository for Perl modules (as well as scripts, distros and much else). There are modules written by people from all over the world for almost every conceivable purpose. There is usually no need to reinvent the wheel in Perl - just grab a module (e.g. Wheel::Base)!

We tested some code using LWP::UserAgent and HTML::TreeBuilder. Useful documentation is here:

Below is a simple UserAgent example:

use LWP::UserAgent;
#Use the LWP::UserAgent module

my $ua = LWP::UserAgent->new;
#Create a new UserAgent

my $url="http://www.contractormisconduct.org/index.cfm/1,73,222,html?CaseID=2";
#Set up a string containing a URL

my $response = $ua->get($url);
#Use the UA 'get' method to retrieve the webpage. This returns an HTTP Response object

if (!$response->is_success) {die "Couldn't get $url: ".$response->status_line."\n";}
#A response object comes back even when the request fails (server down, page not
#found, ...), so check that it succeeded before trusting its content

my $content=$response->decoded_content;
#Get the response as one long text string, so we can work with it...

And now for a TreeBuilder example:

use HTML::TreeBuilder;
#Use the HTML::TreeBuilder module

my $tree = HTML::TreeBuilder->new; # empty tree
#Create a new tree object

$tree->parse($content);
#Load up the tree from the content string (that we got using UA)

$tree->eof;
#Tell the parser there is no more content to come, so that it finishes building the tree

my $dump=$tree->as_text;
#Get all of the text in the tree as one string, if that is what we want

my $incidentelement=$tree->look_down("id","primecontent");
#Or use HTML::Element methods to look_down the tree for a tag with some properties.
#Here we ask for the first element whose id attribute is "primecontent";
#$incidentelement will be undefined if the page has no such element

An Example Webcrawler

I wrote the following simple webcrawler for a fellow PhD student:

#!/usr/bin/perl -w
use strict;

use LWP::UserAgent;
#Use the LWP::UserAgent module
use HTML::TreeBuilder;
#Use the HTML::TreeBuilder module

my $ua = LWP::UserAgent->new;
#Create a new UserAgent

my @Pkids;
open (PKIDS,"<","Pkidfile.txt") || die "Can't open the PKID file to read $!";
#Open the Pkid file to read - this file has a Pkid on each line. You can get some from here: http://myaccount.sdar.com/RealtorSrch.asp

while (<PKIDS>) {
    #Read the pkid file line by line

    chomp $_;
    #Remove the \n (newline symbol) from each line

    if ($_ !~ /\S/) {next;}
    #Skip blank lines, so that we don't request a page for an empty PKID

    push(@Pkids,$_);
    #Add the PKID to an array
}

close (PKIDS);
#We have all of the PKIDs in memory now, so close the input filehandle

open (RESULTS,">","Results.txt") || die "Can't write the Results.txt file $!";
#Open the Results file to write. The filehandle is called RESULTS because
#the writeoutput subroutine prints to it by that name

my $headerflag=0;
#Set a flag to indicate whether we wrote the header line to the output file
#(the writeoutput subroutine reads and sets this flag)

foreach my $Pkid (sort (@Pkids)) {
    #Go through the PKIDs in (alphabetical) order

    my $url="http://myaccount.sdar.com/RealtorSrchDetail.asp?PKID=".$Pkid;
    #Set up a string containing a URL

    my $response = $ua->get($url);
    #Use the UA 'get' method to retrieve the webpage. This returns an HTTP Response object

    if (!$response->is_success) {
        #The request failed - report it and move on to the next PKID,
        #rather than trying to parse an error page

        warn "Couldn't get $url: ".$response->status_line."\n";
        sleep(2);
        next;
    }

    my $content=$response->decoded_content;
    #Get the response as one long text string, so we can work with it...

    my $tree = HTML::TreeBuilder->new; # empty tree
    #Create a new tree object

    $tree->parse($content);
    #Load up the tree from the content string (that we got using UA)

    $tree->eof;
    #Tell the parser that there is no more content, so it finishes building the tree

    my $name=$tree->look_down("width","520");
    #Find an element in the HTML that has width=520 (this is where names are stored)

    if (!defined $name) {
        #No such element on this page - calling a method on an undefined
        #value would kill the script, so report it and skip this PKID

        warn "Couldn't find a name for PKID $Pkid\n";
        $tree->delete;
        sleep(2);
        next;
    }

    my $nametext=$name->as_text;
    #Convert it to text

    $nametext=~s/^\s{1,}//;
    #Remove leading spaces

    $nametext=~s/\s{1,}$//;
    #Remove trailing spaces

    $nametext=~s/\s{2,}/ /g;
    #Replace runs of two or more spaces with a single space, globally.
    #(There must be no ^ here - a ^ would only allow a match at the very start of the string)

    my @fieldstext;
    #Declare an array

    my @fields=$tree->look_down("class","field_labels");
    #Find all of the field elements

    foreach my $field (@fields) {
    #Go through them

        my $fieldparent=$field->parent;
        #Go to their parent

        my $fieldparenttext=$fieldparent->as_text;
        #Turn the parent into text

        $fieldparenttext=~s/^\s{1,}//; $fieldparenttext=~s/\s{1,}$//; $fieldparenttext=~s/\s{2,}/ /g;
        #Deal with spaces again

        push @fieldstext,$fieldparenttext;
        #Add the fields to a list
    }

    writeoutput($Pkid,$nametext,@fieldstext);
    #Call the write output subroutine

    $tree->delete;
    #Ask the tree to dismantle itself - this frees up the memory it was using.
    #(The other variables were declared with 'my' inside the loop, so they are
    #thrown away automatically at the end of each iteration)

    sleep(2);
    #Pause for a couple of seconds, so that we don't hammer the web server
}

close (RESULTS) || die "Can't finish writing the Results.txt file $!";
#Close the Results filehandle - this flushes the write buffer, so check that it worked

sub writeoutput {
#Declare the writeoutput subroutine. It writes one tab-separated line to the
#RESULTS filehandle for one realtor, and before the very first line it also
#writes a header line of column names.
#Parameters: the Pkid, then the name, then any number of "Label:value" strings.
#It relies on two things set up by the main script: the open RESULTS
#filehandle and the $headerflag variable.
#NOTE: the header is built from the labels of the first record only, so a later
#record with different labels will not line up with the header columns.

    my ($pkid,$nametext,@fields)=@_;
    #Unpack the parameters: the first two are single values, the rest go into an array

    my $data={};
    #Set up an anonymous hash

    $data->{"A Pkid"}=$pkid;
    #Set the A Pkid field to the first parameter passed to the subroutine

    $data->{"A Name"}=$nametext;
    #Set the A Name field to the second parameter passed to the subroutine
    #(presumably the "A " prefix is there so these two sort ahead of the other labels)

    foreach my $field (@fields) {
    #Go through the array

        if (!defined $field || $field eq "") {next;}
        #Skip empty fields - they have no label to use as a hash key

        my ($key,$value)=split(":",$field,2);
        #Split the field on the first colon only: the part before it is the key and
        #everything after it (including any further colons) is the value

        if (!defined $value) {$value="";}
        #A field with no colon at all has a key but an empty value

        $data->{$key}=$value;
        #Write the hash entry
    }

    my @columns=sort {$a cmp $b} (keys %{$data});
    #Get the keys in alphabetical order - the header and the data use the same order

    if (!$headerflag) {
        #If the headerflag is 0 then do this

        foreach my $column (@columns) {
        #Go through the keys

            print RESULTS $column."\t";
            #Write the key followed by a tab
        }
        print RESULTS "\n";
        #Print a newline

        $headerflag=1;
        #Set the headerflag to 1, so the header is only written once
    }

    foreach my $column (@columns) {
        #Go through the keys again

        print RESULTS $data->{$column}."\t";
        #This time print the data followed by tabs
    }
    print RESULTS "\n";
    #print a newline
}

print 'Thanks to Ed';
#Finish by writing a thank-you message to the screen (standard output).