#!/usr/local/bin/perl
# Driver and shared state for the checklist converter.
#
# main() reads every *.htm entry listed in @blah, writes a tagged copy to
# $datapath as "<name>.out", and hands buffered lines to OutputsFamDoc(),
# which writes them below $famDocPath as
#   <FAMILY>.html, <FAMILY>/<GENUS>.html and <FAMILY>/<GENUS>/<species>.html.
# Everything declared here is a package global shared by both subs.

# Directory scanned for input files; the trailing slash is required because
# main() concatenates file names directly onto it.
$path = "/dl1/FLORA/cgi/CFloraChecklist/testing/parseItgaurav/";
# Directory receiving the ".out" files (also concatenated, so keep the slash).
$datapath = "/dl1/FLORA/cgi/CFloraChecklist/testing/parseItgaurav/data/";
# Prefix main() puts in front of rewritten image names in <img src="...">.
$imageloc = "/FLORA/centexflora/image/";

# One element per `ls` output line, each still carrying its trailing newline.
@blah=`ls $path`;
# Root of the generated document tree (no trailing slash).
$famDocPath = "/dl1/FLORA/cgi/CFloraChecklist/testing/parseItgaurav/family";
@famDocWOListing; #buffer of lines for the document currently being collected
$famDocName; #file name (without .html) of the current family document
# File name (without .html) of the current species document.
$speciesDocName;
# main() only buffers a line into @famDocWOListing while this is 1.
$writeFamDoc = 1;
# Remember which family / genus directories OutputsFamDoc() already created.
%genDirExist; %specDirExist;
# Not referenced anywhere else in the visible code.
$inListing = 0;
$docLevel = 0; #which kind of document the buffer holds: 0 family, 1 genus, 2 species
# File name (without .html) of the current genus document.
$genDocName;

# Destructive: wipes the output tree of the previous run before regenerating.
`rm -f -r $famDocPath/*`;

$level = 0;  #which listing main() is inside: 0 after a family heading, 1 after a genus heading
$addLine = 0; #set to 1 once a species listing entry was linked at level 1; enables the species anchors
%genra1; #written by main() for level-0 genus listings; never read in this file
%genra2; #declared for level 1 but not used in the visible code
# Sub definitions are compiled before this call runs, so main() may be
# defined further down the file.
main();
# Make everything that was just generated world-readable/traversable.
`chmod -R 755 $famDocPath/*`;
# main - Convert every *.htm entry of @blah.
#
# For each input file the lines are cleaned of presentational markup, tagged
# (family / genus / species headings, listing links and anchors) and printed
# to "$datapath<name>.out".  In parallel every processed line is buffered in
# @famDocWOListing; each time a new family, genus or species heading is met
# OutputsFamDoc() is called FIRST, so the buffer is flushed under the
# previous document's name, and only then are $docLevel and the name
# variables switched to the new document.
#
# Reads globals:   @blah $path $datapath $imageloc
# Writes globals:  $level $addLine $docLevel $writeFamDoc $famDocName
#                  $genDocName $speciesDocName @famDocWOListing %genra1
# Takes no arguments; the return value is not meaningful.
# A file that cannot be opened is reported with warn() and skipped.
sub main
{
  foreach my $listing (@blah)
  {
    next unless $listing =~ /\.htm/;

    # `ls` lines end in a newline; three-argument open does not strip it
    # the way the old two-argument form did, so remove it explicitly.
    chomp(my $name = $listing);
    my $infile  = $path . $name;
    my $outfile = $datapath . $name . ".out";

    my $in;
    unless (open($in, '<', $infile))
    {
      warn "main: cannot read $infile: $!\n";
      next;
    }
    my @lines = <$in>;
    close($in);

    my $out;
    unless (open($out, '>', $outfile))
    {
      warn "main: cannot write $outfile: $!\n";
      next;
    }

    my $idx = 0;   # index of the current line inside @lines
    foreach (@lines)
    {
      # clean up; $_ aliases the array element, so this edits @lines in place
      $_ = _strip_wp_markup($_);
      s/<\/STRONG>\s{0,}<STRONG>//g;
      s/>\s{3,}</></g;

      # zaps the images: point NAME.BMP at $imageloc/Name.jpg
      # NOTE(review): the substitution replaces only the text up to the first
      # double quote after "<img"; whatever followed that quote in the input
      # tag stays in the line.  Confirm against real input.
      if (/(<img.*?>)/)
      {
        if (/\/(\w*\.BMP)\"/)
        {
          my $imgname = ucfirst lc $1;
          $imgname =~ s/bmp/jpg/g;
          if ($_ !~ /width/ || $_ !~ /height/)
          {
            s/<img.*?\"/<img src=\"$imageloc$imgname\" width=300 height=300/;
          }
          else
          {
            s/<img.*?\"/<img src=\"$imageloc$imgname\"/;
          }
        }
      }

      # ---- family headings -------------------------------------------
      # The five variants below are tested independently (not elsif), as
      # before.  Each one starts a new family document and then tags the
      # heading in the line.

      # plain family name; can't use /STRONG at the end since it might
      # show up on the next line
      if (/<STRONG>([A-Z\/<>]{1,})\s{0,}<\/CENTER>/)
      {
        _start_family($out, $1);
        s/<STRONG>([A-Z\/<>]{1,})\s{0,}<\/CENTER>/<STRONG><title>$1<\/title><i><FAMILY>$1<\/FAMILY><\/i><\/CENTER><a family=\"$1\">/i;
      }
      # special case: <P><CENTER><STRONG>ASTERACEAE (COMPOSITAE)</STRONG></CENTER>
      if (/<STRONG>([A-Z]{1,}) \(([A-Z]{1,})\)<\/STRONG>/)
      {
        _start_family($out, $1);
        s/<STRONG>([A-Z]{1,})\s{0,}\(([A-Z]{1,})\)<\/STRONG>/<STRONG><i><FAMILY>$1<\/FAMILY><\/i> \($2\)<\/STRONG><a family=\"$1\"><title>$1<\/title>/;
      }
      # for family: <P><STRONG><CENTER>BRASSICACEAE (CRUCIFERAE)</CENTER>
      # The optional tag is matched as a group; the former character class
      # [<CENTER>] also consumed leading C/E/N/T/R letters of the name.
      # NOTE(review): the substitution below still requires the name to
      # follow <STRONG> directly, so a heading with <CENTER> in between
      # starts a new document but is not tagged - confirm whether intended.
      if (/<STRONG>(?:<CENTER>){0,}([A-Z]{1,})\s{0,}\(([A-Z]{1,})\)<\/CENTER>/)
      {
        _start_family($out, $1);
        s/<STRONG>([A-Z]{1,})\s{0,}\(([A-Z]{1,})\)<\/CENTER>/<STRONG><i><FAMILY>$1<\/FAMILY><\/i>\($2\)<\/CENTER><a family=\"$1\"><title>$1<\/title>/;
      }
      # for family: <P><CENTER><STRONG>CAMPANULACEAE  (including Sphenocleaceae)</CENTER>
      if (/<STRONG>([A-Z]{1,})\s{0,}\(([A-Za-z]{1,} [A-Za-z]{1,})\)<\/CENTER>/)
      {
        _start_family($out, $1);
        s/<STRONG>([A-Z]{1,})\s{0,}\(([A-Za-z]{1,} [A-Za-z]{1,})\)<\/CENTER>/<STRONG><i><FAMILY>$1<\/FAMILY><\/i>\($2\)<\/CENTER><a family=\"$1\"><title>$1<\/title>/;
      }
      # for family:
      # <P><CENTER><STRONG>CLUSIACEAE (GUTTIFERAE)--Including Hypericaceae</STRONG></CENTER>
      # $1 is the family, $2 the bracketed name, $3 and $4 the two words after "--"
      if (/<CENTER><STRONG>([A-Z]{1,})\s{0,}\(([A-Z]{1,})\)\s{0,}--\s{0,}([a-zA-Z]{1,})\s{0,}([a-zA-Z]{1,})<\/STRONG><\/CENTER>/)
      {
        _start_family($out, $1);
        s/<CENTER><STRONG>([A-Z]{1,})\s{0,}\(([A-Z]{1,})\)\s{0,}--\s{0,}([a-zA-Z]{1,})\s{0,}([a-zA-Z]{1,})<\/STRONG><\/CENTER>/<CENTER><STRONG><i><FAMILY>$1<\/FAMILY><\/i>\($2\)--$3 $4<\/CENTER><\/STRONG><a family=\"$1\">/;
      }

      # ---- genus headings --------------------------------------------
      # differences in formatting of genus headings, so several cases
      if (/<CENTER><STRONG>(\d{1,}.)\s{0,}([A-Z]{1,})/ || /<STRONG><CENTER>(\d{1,}.)\s{0,}([A-Z]{1,})/
          || /<CENTER>(\d{1,})<STRONG>\.\s{0,}([A-Z]{1,})/)
      {
        s/<CENTER><STRONG>(\d{1,}.)\s{0,}([A-Z]{1,})/<CENTER><title>$famDocName $2<\/title><STRONG>$1 <a name="$famDocName $2"><i><GENRA>$2<\/GENRA><\/i><\/a><a family="$famDocName"><a genus="$2">/;
        s/<STRONG><CENTER>(\d{1,}.)\s{0,}([A-Z]{1,})/<CENTER><title>$famDocName $2<\/title><STRONG>$1 <a name="$famDocName $2"><i><GENRA>$2<\/GENRA><\/i><\/a><a family="$famDocName"><a genus="$2">/;
        # yet another anomaly: <P><CENTER>4<STRONG>.  ALNUS  Mill.  </STRONG>Alder</CENTER>
        s/<CENTER>(\d{1,})<STRONG>\.\s{0,}([A-Z]{1,})/<CENTER><title>$famDocName $2<\/title><STRONG>$1<a name="$famDocName $2"><i><GENRA>$2<\/GENRA><\/i><\/a><a family="$famDocName"><a genus="$2">/;

        # Save the genus right away instead of reading $2 again after
        # further statements and a sub call.
        my $genus = $2;

        $level   = 1;
        $addLine = 0;
        # flush what was collected so far, then switch to the genus document
        $writeFamDoc = 1;
        OutputsFamDoc();
        $genDocName = $genus;
        $docLevel   = 1;
      }

      # ---- genus listing directly under a family (level 0) ------------
      if ($level == 0)
      {
        # format 1: <STRONG>6.  <i>Onosmodium</i></STRONG>
        # format 2: <STRONG>21</STRONG>. <STRONG><i>Cirsium</i></STRONG>
        # format 3: <STRONG>35.  <i>Aster</i>(<i>Symphyotrichum</i>)</STRONG>
        # format 4: <STRONG>60.</STRONG> <STRONG><i>Gaillardia</i></STRONG>
        if (/<STRONG>\d{1,}.\s{0,}<i>([A-Z][a-z]{1,})<\/i><\/STRONG>/
            || /<STRONG>\d{1,}\s{0,}<\/STRONG>.\s{0,}<STRONG><i>([A-Z][a-z]{1,})<\/i><\/STRONG>/
            || /<STRONG>\d{1,}.\s{0,}<i>([A-Z][a-z]{1,})<\/i>\s{0,}\(<i>([A-Z][a-z]{1,})<\/i>\)<\/STRONG>/
            || /<STRONG>\d{1,}.\s{0,}<\/STRONG>\s{0,}<STRONG><i>([A-Z][a-z]{1,})<\/i><\/STRONG>/)
        {
          # genus anchors are named in upper case, see the genus headings
          my $anchor = uc $1;

          s/<STRONG>(\d{1,}.)\s{0,}<i>([A-Z][a-z]{1,})<\/i><\/STRONG>/<STRONG>$1 <a href="\#$famDocName $anchor"><i>$2<\/i><\/a><\/STRONG>/;
          s/<STRONG>(\d{1,})\s{0,}<\/STRONG>.\s{0,}<STRONG><i>([A-Z][a-z]{1,})<\/i><\/STRONG>/<STRONG>$1\.<a href="\#$famDocName $anchor"><i>$2<\/i><\/a><\/STRONG>/;
          s/<STRONG>(\d{1,}.)\s{0,}<i>([A-Z][a-z]{1,})<\/i>\s{0,}\(<i>([A-Z][a-z]{1,})<\/i>\)<\/STRONG>/<STRONG>$1<a href="\#$famDocName $anchor"><i>$2 \($3\)<\/i><\/a><\/STRONG>/;
          s/<STRONG>(\d{1,}.)\s{0,}<\/STRONG>\s{0,}<STRONG><i>([A-Z][a-z]{1,})<\/i><\/STRONG>/<STRONG>$1<a href="\#$famDocName $anchor"><u>$2<\/u><\/a><\/STRONG>/;

          # Record the genus; $1 at this point is the list number left
          # behind by the substitutions above, not the name.
          $genra1{$anchor} = 1;
        }

        # A listing entry broken over two lines: the number ends this line
        # and the name starts the next one.  Clean and link the next line
        # now; it is cleaned again (harmlessly) when its turn comes.
        if (/<STRONG>(\d{1,}.)\s{0,}$/ && $idx < $#lines)
        {
          my $next = _strip_wp_markup($lines[$idx + 1]);
          # deliberately kept as before: no whitespace allowed between tags
          $next =~ s/<\/STRONG><STRONG>//g;
          $next =~ s/>\s{3,}</></g;

          if ($next =~ /<i>([A-Z][a-z]{1,})<\/i><\/STRONG>/)
          {
            my $target = uc $1;
            $next =~ s/<i>([A-Z][a-z]{1,})<\/i><\/STRONG>/<i><a href="\#$famDocName $target">$1<\/a><\/i><\/STRONG>/;
            $lines[$idx + 1] = $next;
          }
        }
        $addLine = 0;
      }

      # ---- species listing under a genus (level 1) --------------------
      # pattern for level == 1: .....<STRONG>1.  <u>M</u>. <u>petiolata</u></STRONG>
      if ($level == 1)
      {
        # pattern 1
        if (/\.{1,}<STRONG>\s{0,}\d{1,}\w{0,}.\s{0,}<i>\s{0,}\w\s{0,}<\/i>.\s{0,}\w{0,}\s{0,}<i>\s{0,}(\w{1,}\-{0,1}\w{0,})\s{0,}<\/i>\s{0,}<\/STRONG>/)
        {
          s/\.{1,}<STRONG>\s{0,}(\d{1,}\w{0,}.)\s{0,}<i>\s{0,}(\w)\s{0,}<\/i>.\s{0,}(\w{0,})\s{0,}<i>\s{0,}(\w{1,}\-{0,1}\w{0,})\s{0,}<\/i>\s{0,}<\/STRONG>/...<STRONG>$1<i>$2<\/i>.$3<i><a href="\#$famDocName $genDocName $4">$4<\/a><\/i><\/STRONG>/;
          $addLine = 1;
        }
        # pattern 2:
        # <STRONG>1.  <i>A theophrasti</i></STRONG></P>
        elsif (/...<STRONG>(\d{1,})\.\s{0,}<i>([A-Z])\s{1,}([A-Za-z]{1,})<\/i>\s{0,}<\/STRONG>\s{0,}<\/P>/)
        {
          s/...<STRONG>(\d{1,})\.\s{0,}<i>([A-Z])\s{1,}([a-z]{1,})<\/i>\s{0,}<\/STRONG>\s{0,}<\/P>/...<STRONG>$1<i>$2. <a href="\#$famDocName $genDocName $3">$3<\/a><\/i><\/STRONG>/;
          $addLine = 1;
        }
        # pattern 3:
        # ...<STRONG>1.  <i>H</i>.</STRONG><i><STRONG>annuus</STRONG></i></P>
        elsif (/\.{1,}<STRONG>\s{0,}\d{1,}\.\s{0,}<i>\w{1}<\/i>\.<\/STRONG>\s{0,}<i><STRONG>[A-Za-z]{1,}<\/STRONG><\/i>/)
        {
          s/\.{1,}<STRONG>\s{0,}(\d{1,})\.\s{0,}<i>(\w{1})<\/i>\.<\/STRONG>\s{0,}<i><STRONG>([A-Za-z]{1,})<\/STRONG><\/i>/...<STRONG>$1. <i>$2. <a href="\#$famDocName $genDocName $3">$3<\/a><\/i><\/STRONG>/;
          $addLine = 1;
        }

        # special case for <i>lanceolatus</i></STRONG>	</P>, since it's
        # broken into two lines
        s/<i>(lanceolatus)<\/i><\/STRONG>\s{0,}<\/P>/<i><a href="\#$famDocName $genDocName $1">$1<\/a><\/i><\/STRONG><\/P>/;
      }

      # ---- species headings (targets of the listing links above) ------
      # pattern: <P><STRONG>1.  <u>M</u>. <u>petiolata</u>
      if ($level == 1 && $addLine == 1)
      {
        if (/<P><STRONG>\d{1,}.\s{0,}<i>\s{0,}\w\s{0,}<\/i>.\s{0,}\w{0,}\s{0,}<i>\s{0,}(\w{1,}\-{0,1}\w{0,})\s{0,}<\/i>/)
        {
          # The "$4" of the species attribute used to be split over a line
          # break; it is written contiguously here so the capture is
          # interpolated no matter how whitespace after "$" is parsed.
          s/<P><STRONG>(\d{1,}.)\s{0,}<i>\s{0,}(\w)\s{0,}<\/i>.\s{0,}(\w{0,})\s{0,}<i>\s{0,}(\w{1,}\-{0,1}\w{0,})\s{0,}<\/i>/<p><STRONG>$1<i>$2<\/i>. $3 <i><a name="$famDocName $genDocName $4">$4<\/a><\/i><a family="$famDocName"><a genus="$genDocName"><a species="$4"><title>$famDocName $genDocName $4<\/title>/;

          # save the epithet before anything else can touch the captures
          my $species = $4;

          # flush what was collected so far, then switch to the species document
          $writeFamDoc = 1;
          OutputsFamDoc();
          $speciesDocName = $species;
          $docLevel       = 2;
        }
        # second anchor pattern (tags the line only, starts no new document):
        # <P><STRONG><i>5.  V</i>. <i>sagittata</i> Ait. var <i>sagittata</i></STRONG>
        if (/<P><STRONG><i>\d{1,}\.\s{0,}[A-Z]{1}<\/i>\.\s{0,}<i>[a-z]{1,}<\/i>/)
        {
          s/<P><STRONG><i>(\d{1,})\.\s{0,}([A-Z]{1})<\/i>\.\s{0,}<i>([a-z]{1,})<\/i>/<p><STRONG>$1 <i>$2. <a name="$famDocName $genDocName $3">$3<\/a><\/i><a family="$famDocName"><a genus="$genDocName"><a species="$3"><title>$famDocName $genDocName $3<\/title>/;
        }
      }

      # Disabled filter, kept for reference: lines with two or more dots
      # were once treated as listings and left out of the document buffer.
#      if(/\.{2,}/)
#      {
#        $writeFamDoc = 0;
#      }
      push @famDocWOListing, $_ if $writeFamDoc == 1;
      print {$out} $_;
      $idx++;
    }
    close($out) or warn "main: error closing $outfile: $!\n";
  }

  # don't forget to output the last document before exiting
  $writeFamDoc = 1;
  OutputsFamDoc();
}

# _start_family - Shared bookkeeping for every family heading variant.
# Resets the listing state, prints the "\002 / <!-- @T:FAMILY @ -->" record
# header to the open handle OUT, flushes the previously buffered document
# through OutputsFamDoc() and only afterwards makes FAMILY the current
# family document ($docLevel 0).  FAMILY is the raw capture of the heading.
sub _start_family
{
  my ($out, $family) = @_;

  $level   = 0;
  $addLine = 0;

  # sometimes there are center tags in the captured name
  $family =~ s/<CENTER>//;
  $family =~ s/<\/CENTER>//;

  print {$out} "\002\n", "<!--\n", "\@T:$family \n", "\@\n", "-->\n";

  # for sure this goes into a file, so no need to test the flag
  $writeFamDoc = 1;
  OutputsFamDoc();
  $docLevel   = 0;
  $famDocName = $family;
  $famDocName =~ s/<\/STRONG>//g;
}

# _strip_wp_markup - Return a copy of TEXT with the presentational markup
# normalised: underline SPANs become <i>, remaining SPANs and the
# <BR WP="BR2"> breaks are dropped, centred paragraphs become <CENTER>, and
# empty STRONG / P / i pairs are removed.  Merging adjacent STRONG runs and
# squeezing whitespace between tags is left to the caller because the two
# call sites do the former slightly differently.
sub _strip_wp_markup
{
  my ($text) = @_;

  $text =~ s/<SPAN STYLE="text-decoration: underline">(.*?)<\/SPAN>/<i>$1<\/i>/g;
  $text =~ s/<SPAN STYLE=".*?" STYLE=".*?">(.*?)<\/SPAN>/$1/g;
  $text =~ s/<SPAN STYLE=".*?">(.*?)<\/SPAN>/$1/g;
  $text =~ s/<BR WP="BR2">//g;
  $text =~ s/<SPAN STYLE=".*?>//g;

  # some anomalies in centering text for genus headings
  $text =~ s/<P ALIGN="CENTER">(.*?)<\/P>/<CENTER>$1<\/CENTER>/;
  $text =~ s/<P STYLE="text-align: CENTER">(.*?)<\/P>/<CENTER>$1<\/CENTER>/;

  # the rest of the cleaning up
  $text =~ s/<\/SPAN>//g;
  $text =~ s/<STRONG><\/STRONG>//g;
  $text =~ s/<P><\/P>//g;
  $text =~ s/<i><\/i>//g;

  return $text;
}
# OutputsFamDoc - Flush the buffered lines in @famDocWOListing to the
# document selected by $docLevel:
#   0  ->  $famDocPath/$famDocName.html
#   1  ->  $famDocPath/$famDocName/$genDocName.html
#   2  ->  $famDocPath/$famDocName/$genDocName/$speciesDocName.html
# Missing family / genus directories are created on first use and remembered
# in %genDirExist / %specDirExist.
#
# Nothing happens (and the buffer is kept) when the buffer holds fewer than
# two elements or when the name needed for the current level is shorter than
# two characters.  After a flush the buffer is reset to a single empty
# string, which is why "more than one element" is the test for real content.
#
# Takes no arguments; the return value is not meaningful.  Failures to
# create a directory or to write the file are reported with warn(); the
# buffer is still reset in that case, as it always was.
sub OutputsFamDoc
{
  return unless @famDocWOListing > 1;

  my $target;
  if ($docLevel == 0 && length($famDocName) > 1)
  {
    $target = "$famDocPath/$famDocName.html";
  }
  elsif ($docLevel == 1 && length($genDocName) > 1) # genus docs
  {
    my $famDir = "$famDocPath/$famDocName";
    if (!($genDirExist{$famDocName} == 1))
    {
      # Built-in mkdir/chmod instead of backticks: the name is no longer
      # word-split or interpreted by a shell.
      unless (-d $famDir || mkdir($famDir))
      {
        warn "OutputsFamDoc: cannot create $famDir: $!\n";
      }
      chmod(0777, $famDir);
      $genDirExist{$famDocName} = 1;
    }
    $target = "$famDir/$genDocName.html";
  }
  elsif ($docLevel == 2 && length($speciesDocName) > 1) # species docs
  {
    my $genDir = "$famDocPath/$famDocName/$genDocName";
    if (!($specDirExist{$genDocName} == 1))
    {
      unless (-d $genDir || mkdir($genDir))
      {
        warn "OutputsFamDoc: cannot create $genDir: $!\n";
      }
      $specDirExist{$genDocName} = 1;
    }
    $target = "$genDir/$speciesDocName.html";
  }

  # no usable name for this level: keep collecting
  return unless defined $target;

  if (open(my $doc, '>', $target))
  {
    print {$doc} @famDocWOListing;
    close($doc) or warn "OutputsFamDoc: error closing $target: $!\n";
  }
  else
  {
    warn "OutputsFamDoc: cannot write $target: $!\n";
  }

  # one empty placeholder element, not an empty list (see header comment)
  @famDocWOListing = ("");
}
