#!/usr/bin/perl 

=head1 NAME

CUICollector.pl - Scrapes files from the MetaMapped Medline Baseline project to
build a database of CUI bigram scores. 

=head1 SYNOPSIS

    $ perl CUICollector.pl --directory metamapped-baseline/2014/ 
    CUICollector 0.04 - (C) 2015 Keith Herbert and Bridget McInnes, PhD
    Released under the GNU GPL.
    Connecting to CUI_DB on localhost
    Parsing file: /home/share/data/metamapped-baseline/2014/text.out_01.gz
    Parsing file: /home/share/data/metamapped-baseline/2014/text.out_02.gz
    Parsing file: /home/share/data/metamapped-baseline/2014/text.out_03.gz
    Parsing file: /home/share/data/metamapped-baseline/2014/text.out_02.gz
    Parsing file: /home/share/data/metamapped-baseline/2014/text.out_03.gz
    Entering scores into CUI_DB
    ...
    Finished

=head1 USAGE

Usage: CUICollector.pl [DATABASE OPTIONS] [OTHER OPTIONS] [FILES | DIRECTORIES]

=head1 INPUT

=head2 Required Arguments:

=head3 [FILES | DIRECTORIES]

Specify a directory containing *ONLY* compressed MetaMapped Medical Baseline files:
    --directory /path/to/files/

Multiple directories may also be supplied:
    --directory /path/to/first/folder/ /path/to/second/folder/

Likewise, specify a list of individual files
    --files text.out_01.txt.gz text_mm_out_42.txt.gz text_mm_out_314.txt.gz

a glob of files:
    --files /path/to/dir/*.gz

Or just one:
    --files text.out_01.txt.gz 


=head2 Optional Arguments:


=head3 --database STRING        

Database to contain the CUI bigram scores. DEFAULT: CUI_DB

If the database is not found in MySQL, CUICollector will create it for you. 

=head3 --username STRING

Username is required to access the bigram database on MySql. You will be prompted
for it if it is not supplied as an argument. 

=head3 --password STRING

Password is required to access the umls database on MySql. You will be prompted
for it if it is not supplied as an argument. 

=head3 --hostname STRING

Hostname where mysql is located. DEFAULT: localhost

=head3 --port STRING

The port your mysql is using. DEFAULT: 3306

=head3 --file_step INTEGER

How many MetaMapped Medical Baseline files to read between writes to the database. 
DEFAULT: 5

MMO files can be rather large so setting a low file_step reduces the memory footprint of the script. However, setting a higher file_step reduces the number of write operations to the database.

=head3 --debug 

Sets the debug flag for testing. NOTE: extremely verbose.

=head3 --verbose 

Print the current status of the program to STDOUT. This indicates the files being processed and when the program is writing to the database. This is the default output setting.

=head3 --quiet 

Don't print anything to STDOUT.

=head3 --help

Displays the quick summary of program options.

=head1 OUTPUT

By default, CUICollector prints he current status of the program as it works through the Metamapped Medline Output files (disable with `--quiet`). It creates a database (or connects to an existing one) and adds bigram scores of the CUIs it encounters in the MMO files. 

The resulting database will have four tables:

=over

=item N_11
    cui_1   cui_2   n_11
This shows the count (n_11) for every time a particular CUI (cui_1) is immediately followed by another particular CUI (cui_2) in an utterance. 

=item N_1P
    cui_1   n_1p
This shows the count (n_11) for every time a particular CUI (cui_1) is followed by any CUI in an utterance. 

=item N_P1
    cui_2   n_p1
This shows the count (n_p1) for every time a particular CUI (cui_2) is immediately preceded by any CUI in an utterance. 

=item N_PP
    n_pp
This single value is the total count of all cui_1, cui_2 bigram pairs. 

=back

=head1 AUTHOR

 Keith Herbert, Virginia Commonwealth University

=head1 COPYRIGHT

Copyright (c) 2015,
Keith Herbert, Virginia Commonwealth University
herbertkb at vcu edu


This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to:

 The Free Software Foundation, Inc.,
 59 Temple Place - Suite 330,
 Boston, MA  02111-1307, USA.

=cut

###############################################################################
#                               THE CODE STARTS HERE
###############################################################################

use strict;
use warnings;

use Compress::Zlib; 
use Data::Dumper;
use DBI;
use Getopt::Long;

use feature qw(say);

$|=1;   # disables buffering to STDOUT

###############################################################################
# CONSTANT STRINGS
###############################################################################

my $version = "0.19";
my $header = 
"CUICollector $version - (C) 2015 Keith Herbert and Bridget McInnes, PhD\n"
."Released under the GNU GPL.";

my $usage = $header."\n"
."FLAGS\n"
."--debug       Print EVERYTHING to STDERR.\n"
."--verbose     Print log of files processed to STDOUT (DEFAULT)\n"
."--quiet       Print NOTHING to STDERR or STDOUT.\n"
."--help        Print this help screen.\n"
."DATABASE OPTIONS\n"
."--database    Name of MySQL database to store CUI data. (Default=CUI_DB)\n"
."--hostname    Hostname for MySQL database. (Default=localhost)\n"
."--username    Username for MySQL access. (Optional, but will be prompted)\n"
."--password    Password for MySQL access. (Optional, but will be prompted)\n"
."MetaMapped Baseline Files\n"
."--file_step   How many files to read between database writes. (DEFAULT=5)\n"
."--directory   Path to directory containing *only* MetaMap Baseline files\n"
."--files       Explicit list of one or more MetaMap Baseline files\n"
."\nUSAGE EXAMPLES\n"
."Open directory ./metamap/ and write to database CUIs on localhost:\n"
."\tperl CUICollector.pl --database CUIs --directory metamap/\n\n"
."Open single decompressed file test.mmb and write to default database:\n"
."\tperl CUICollector.pl --plain --file test.mmb\n"
;

###############################################################################
#                           Parse command line options 
###############################################################################
my $DEBUG = 0;      # Prints EVERYTHING. Use with small testing files.
my $VERBOSE = 1;    # Only print for reading from files, writing to database          
my $HELP = '';      # Prints usage and exits if true.


my $database = "CUI_Bigrams";        # Values needed to connect to MySQL dbase.
my $hostname = "localhost";
my $port     = "3306";
my $username;
my $password;

my $file_step = 5;  # How many files to read between writes to database.
my @dirs    = ();   # Directories containing *only* MetaMap Baseline files
my @files   = ();   # Explicit list of MetaMap Baseline files.

GetOptions( 'debug'         => \$DEBUG, 
            'help'          => \$HELP,
            'verbose!'      => \$VERBOSE,
            'quiet'         => sub { $VERBOSE = 0 }, 
            'database=s'    => \$database,
            'hostname=s'    => \$hostname,
            'port=s'        => \$port,
            'username=s'    => \$username,
            'password=s'    => \$password,
            'file_step=i'   => \$file_step,
            'directory=s{1,}' => \@dirs,
            'files=s{1,}'   => \@files          
);

die $usage unless $#ARGV;    
die $usage if $HELP;               
die("Invalid file_step. Must be positive integer") if $file_step < 1;
die "*** No input files ***\n$usage" unless @dirs or @files;

say $header if $VERBOSE;

## Prompt for username/pass if they weren't provided.
if (not defined $username){
    print "Enter username for MySQL server on $hostname: ";
    $username = <STDIN>;
    chomp $username;
}
if (not defined $password){     
    print "Enter password for $username: ";
    $password = <STDIN>;
    chomp $password;
}
 

## Collect all files from input into one massive list.
for my $dir (@dirs) {
    opendir(DIR, $dir) || die "Could not open dir ($dir)\n";
    push @files, grep { $_ ne "$dir." and $_ ne "$dir.." } map { "$dir$_" } readdir DIR; 
    close DIR;
}

## Test if all these files are readable to avoid a nasty surprise later.
for my $file (@files) {
    die "Cannot read $file" unless -r $file;
}

@files = sort @files;

###############################################################################
#                                   Main 
###############################################################################
say "Connecting to $database on $hostname" if $VERBOSE;
my $dbh = open_mysql_database($database, $hostname, $port, $username, $password);


while (@files) {
    my @curr_files = splice @files, 0, $file_step;
     
    my($bigram_ref, $n1p_ref, $np1_ref, $npp) = process_files(@curr_files);
    
    say "Entering scores into $database..." if $VERBOSE;    
    update_database($dbh, $bigram_ref, $n1p_ref, $np1_ref, $npp);
    
    if ($DEBUG) {
        print Dumper($bigram_ref);
        print Dumper($n1p_ref);
        print Dumper($np1_ref);
        say $npp;
    }
}

# Close connection to database                       
$dbh->disconnect;

say "Finished." if $VERBOSE;


###############################################################################
#                           Database Subroutines
###############################################################################
sub open_mysql_database {
    my ($dbase, $host, $port, $user, $pass) = (@_);
    
    # See if database exists in the specified DBMS                        
    my @dbases = DBI->data_sources("mysql",
      {"host" => $host, "user" => $user, password => $pass});
    my $dbase_exists = grep /DBI:mysql:$dbase/, @dbases;
    
    # Connect to the database if it exists. Otherwise create it from scratch.
    my $dbh;                
    if ($dbase_exists) {
        $dbh =  DBI->connect("DBI:mysql:database=$dbase;host=$host",
                         $user, $pass,
                         {'RaiseError' => 1});
    } 
    else {
        $dbh = DBI->connect("DBI:mysql:host=$host", $user, $pass,
                            {'RaiseError' => 1});
        create_database($dbh, $dbase);
    }
    
    return $dbh;        # Return the handler to keep connection alive.
}

###############################################################################
sub create_database {
    my $dbh = shift;
    my $dbase = shift;
    
    $dbh->do("CREATE DATABASE $dbase");
    $dbh->do("USE $dbase");
    
    $dbh->do("CREATE TABLE N_PP (n_pp BIGINT UNSIGNED KEY)");
    $dbh->do("INSERT INTO N_PP (n_pp) VALUES (?)", undef, 0);
    
    
     $dbh->do("CREATE TABLE N_11 (   
                    cui_1   CHAR(10)    NOT NULL,
                    cui_2   CHAR(10)    NOT NULL, 
                    n_11    BIGINT      NOT NULL, 
                    PRIMARY KEY (cui_1, cui_2) )"   );
                    
     $dbh->do("CREATE TABLE N_1P (   
                    cui_1   CHAR(10)    NOT NULL,
                    n_1p    BIGINT      NOT NULL, 
                    PRIMARY KEY (cui_1) )"   );
                    
     $dbh->do("CREATE TABLE N_P1 (   
                    cui_2   CHAR(10)    NOT NULL,
                    n_p1    BIGINT      NOT NULL, 
                    PRIMARY KEY (cui_2) )"   );        
    
}

###############################################################################
sub update_row {
    my ($dbh,                   # database handler
        $table,                 # name of table being updated
        $updated_field,         # field being updated
        $updated_value,         # value to change in field
        $key_names_ref,         # array ref for key names of table
        $key_values_ref         # array ref to values to locate row in table
    ) = @_;       
        
    
    # Build conditional statement to locate row in table.
    my $conditional = "";
    if (@$key_names_ref) {
        $conditional = join ' AND ', map {"$_ = ? "} @$key_names_ref;
    }
    else {
        $conditional = "$updated_field > -1";
    }
    # Check if row to update is already in table
    my $select = "SELECT * FROM $table WHERE $conditional LIMIT 1";
    say $select if $DEBUG;
    my $sth = $dbh->prepare($select);
    $sth->execute(@$key_values_ref);
            
    # If it is, update entry with new sum.
    my @match = $sth->fetchrow_array();
    if ( @match ) {
        
        $updated_value += $match[-1];
        
        my $update = "UPDATE $table SET $updated_field = ? WHERE $conditional";
                        
        $dbh->do( $update,undef, $updated_value, @$key_values_ref );
    }
    # Otherwise, insert a new row into the table.     
    else {
    
        # Build the insert statement
        my $key_list = join ', ', @$key_names_ref;
        my $field_length = @$key_names_ref;
        my $insert ="INSERT INTO $table ($key_list, $updated_field) " . 
                        "VALUES (" . '?, ' x $field_length . '?' . ")";
        say $insert if $DEBUG;              
        $dbh->do($insert, undef, @$key_values_ref, $updated_value); 
    }
}

##############################################################################
sub update_database {
    my($dbh, $bigram_ref, $n1p_ref, $np1_ref, $npp) = (@_);
    
    # Add all of the bigram counts to the database
    say STDERR "n11:" if $DEBUG;
    foreach my $cui_1 (keys %$bigram_ref){
        foreach my $cui_2 (keys %{$$bigram_ref{$cui_1}}) {
            my $n_11 = $$bigram_ref{$cui_1}{$cui_2};
            
	        say STDERR "$cui_1\t$cui_2\t$n_11" if $DEBUG;
	        
	        update_row($dbh, 'N_11', 'n_11', $n_11, 
	            ['cui_1', 'cui_2'], [$cui_1, $cui_2] );
        }
    }
         
    # Add all of the CUI_1 sums to database
    print STDERR "n1p: \n" if $DEBUG;
    foreach my $cui_1 (keys %$n1p_ref) {
        my $n_1p = $$n1p_ref{$cui_1};

	    print STDERR "$cui_1 $n_1p \n" if $DEBUG; 
        
        update_row($dbh, 'N_1P', 'n_1p', $n_1p, ['cui_1'], [$cui_1] );
          
    }        
        
    # Add all of the CUI_2 sums to database
    print STDERR "np1: \n" if $DEBUG;
    foreach my $cui_2 (keys %$np1_ref) {
        my $n_p1 = $$np1_ref{$cui_2};
        say "$cui_2 $n_p1" if $DEBUG;
        
        update_row($dbh, 'N_P1', 'n_p1', $n_p1, ['cui_2'], [$cui_2] );        
    }
    
    # Finally, update the table with the total sum of all bigrams observed.  
    update_row($dbh, 'N_PP', 'n_pp', $npp, [], [] );
}

###############################################################################
#                       This is where the magic happens 
###############################################################################

sub process_files {
    my @files = @_;
    
    my %bigrams;   # cui_1 => cui_2 => sum of cui_1 preceeding cui_2
    my %n1p;       # cui => sum of cui as cui_1
    my %np1;       # cui => sum of cui as cui_2
    my $npp;       # total sum of all bigram observations
        
    # Anonymous subroutine to update the bigram and marginal counts
    my $incrementor = sub {
        my($cui_1, $cui_2, $same_phrase) = @_;
        
        unless (exists $$same_phrase{$cui_1}{$cui_2}) { 
            $bigrams{$cui_1}{$cui_2}++;
            $n1p{$cui_1}++;
            $np1{$cui_2}++;
            $npp++;
        }
        
        $$same_phrase{$cui_1}{$cui_2} = 1;
    };
    
    
    for my $file (@files) {
    
        my $gz = gzopen("$file", "rb") or die "Cannot open $file\n";
        say "Parsing file: $file" if $VERBOSE; 
        
        # Count the bigrams in each utterance until the end of the file.         
        until ($gz->gzeof()) {
            count_bigrams( read_utterance($gz), $incrementor );
        }
        
        $gz->gzclose();
    }
    
   return (\%bigrams, \%n1p, \%np1, $npp); 
}

###############################################################################
sub read_utterance {
    my $gz = shift;
   
    my @phrases;
   
   # The following loop will iterate over all the phrases for this utterance
    while ($gz->gzreadline($_)) {
        
        # Finish when we reach the End Of Utterance (EOU) marker
        last if /^'EOU'/;
            
        # Skip all lines that aren't mappings for a phrase in the utterance
        next unless /^mappings/;
                
        # Break mappings into each possible mapping of phrase into CUIs
        my @maps = split /map\(/, $_;

        # Collect the CUIs in each possible mapping (assumes format 'C1234567')
        # as a set of strings
        my @mappings;           
        for my $map (@maps) {
            
           my $CUI_string = join " ", ( $map =~ m/C\d{7}/g );
           
           say $CUI_string if $CUI_string and $DEBUG;
           
           push @mappings, $CUI_string if $CUI_string; 
        }

        push @phrases, \@mappings if @mappings;
    }
   
    return \@phrases;
}

###############################################################################
sub count_bigrams {
    my( $phrases_ref,   # reference to list of mappings for single utterance 
        $incrementor    # anonymous subroutine to update counting hashes
        ) = @_;
        
    my @phrases = @$phrases_ref;
    
    # Iterate through n-1 phrases in utterance
    for (my $i = 0; $i < $#phrases; $i++) {
                
        my @phrase_1 = @{ $phrases[$i]      };  # Mappings for current phrase
        my @phrase_2 = @{ $phrases[$i + 1]  };  # Mappings for the next phrase

        my %prior;  # Tracks bigrams within same phrase to avoid double counting
        
        # Loop through each of the mappings of the current phrase
        foreach my $map_str_p1 ( @phrase_1 ) {
            my @cuis = split ' ', $map_str_p1;
            
            # Count bigrams up to the k-1th CUI
            for (my $k = 0; $k < $#cuis; $k++) {
                my $cui_1 = $cuis[$k];
                my $cui_2 = $cuis[$k+1];
                
                $incrementor->($cui_1, $cui_2, \%prior);
            }
         
            # Count the kth CUI with the first of each of the next phrases maps
            foreach my $map_str_p2 ( @phrase_2 ) {
                (my $first_of_next_phrase) = $map_str_p2 =~ /(C\d{7})/;
                
                $incrementor->($cuis[-1], $first_of_next_phrase, \%prior);
            }
        }
    }
}
