//global variables for this unit

string cVersion="v1.2";
int const maxtax=2000, maxgap=1000, maxfiles=15;
int nfiles=0, outgroup=0;;
string filenames[maxfiles], c_line;
char cread[1024];
bool codegaps=true, allgaps=true;
char gapchar='-', gapAbsent='A', gapPresent='C', gapUnknown='-', unknownChar1='?', unknownChar2='n', unknownChar3='N';


//--------------------------------------------------------------------

// MAIN PROGRAM CODE

void makeNexus (void)
{
    int nchar=0, ntax, lastntax=0, ngap, eh1, eh2, es1, es2;
    unsigned int maxname=0;
    int ctax, gapcount;
    int gaplist[maxgap+1][4], partitions[maxfiles][4];
    bool readcont, skipseq[maxtax];
    string filename1, filename2, temp, cread;
    string taxonlist[maxtax], datamatrix[maxtax], gapmatrix[maxtax], firstocc[maxgap+1];
    string fdatamatrix[maxfiles][maxtax], fgapmatrix[maxfiles][maxtax];
    string infvalgap;

	//holding arrays for results from reading multiple files
    int a_nchar[maxfiles], a_ngap[maxfiles];

	//functions
    int findHardEnd(string seq, int pos, char gapchar);
    int findSoftEnd(string seq, int pos, char gapchar);
    bool evalinf (string cmatrix[maxtax], int ntax, int cpos);

    struct tm *ptr;       // used to display time and date, see cpp reference;
    time_t lt;

    Form1->SaveDialog1->Execute();
    filename2=Form1->SaveDialog1->FileName.c_str();
    ofstream outfile(filename2.c_str());

    for (int cfile=0;cfile<nfiles;cfile++)  {      //main file reading loop
        filename1=Form1->Memo1->Lines->Strings[cfile].c_str();
        ifstream infile(filename1.c_str());
        if (!infile) {
            //file open error
            displaymsg ("file not found");
            return;
        }
        ctax=0;
        //initialize datamatrix, gapmatrix with empty strings
        for (int i=0; i<maxtax; i++)  {
            datamatrix[i]="";
            gapmatrix[i]="";
        }
        infile>>cread;
        while (!infile.eof())  {    //read until eof
            readcont=true;
            while (readcont)  {     // block for skipping initial lines preceeding first ">"
                if (cread.substr(0,1)==">") readcont= false;
                else infile>>cread;
            }
		    if (cfile==0)  {   //only read taxon list first time
                cread.erase(0,1);
                taxonlist[ctax]=cread;
                while (taxonlist[ctax].size()>5)  {    //prune trailing xx, 5 = minimum taxon name size
                    if (taxonlist[ctax][taxonlist[ctax].size()-1]=='x') taxonlist[ctax].erase(taxonlist[ctax].size()-1);
                    else break;
			    }
            }
            readcont=true;
            while (readcont)  {    //read all nucleotid lines
                infile>>cread;
                if (cread.substr(0,1)==">" || infile.eof())  {
                    readcont=false;
                    break;
                }
                else datamatrix[ctax]=datamatrix[ctax]+cread;   //concatenate individual parts (lines) of fasta sequence
            }
            ctax++;
        }
        nchar=datamatrix[0].size();   // length of first sequence in current datamatrix
        ntax=ctax;
        infile.close(); //file reading done, continue with data processing

		if (cfile==0)  {    //find maximum length of taxon names
			maxname=0;
            for (int i=0; i<ntax; i++) if (taxonlist[i].size()>maxname) maxname=taxonlist[i].size();
		}
        if (cfile!=0 && ntax!=lastntax) {   //taxa dimension error
            displaymsg ("ERROR: the number of taxa varies among datasets");
            return;
        }
        lastntax=ntax;   // remember number of taxa for next round

        //CODE GAPS
	    if (codegaps)  {
            ngap=0;
            for (int i=1; i<nchar-1; i++) {   // gap coding loop going through base positions
                for (int j=0; j<ntax; j++) skipseq[j]=false;
                for (int j=0; j<ntax; j++)  {     // look for gaps in j sequences
                    if(datamatrix[j][i]==gapchar) {
                        //found gap
                        if (datamatrix[j][i-1]!=gapchar && datamatrix[j][i-1]!=unknownChar1 && datamatrix[j][i-1]!=unknownChar2 && datamatrix[j][i-1]!=unknownChar3 && !skipseq[j])  {  //gap starts here and has not been coded before
                            eh1 = findHardEnd(datamatrix[j],i,gapchar);
                            es1 = findSoftEnd(datamatrix[j],i,gapchar);
                            if (eh1==es1 && eh1<nchar-1) {   // CHECK that gap has hard end and does not run to hind edge of alignment
                                ngap++;  // one more gap has been coded
                                gaplist[ngap][0]=ngap; gaplist[ngap][1]=i; gaplist[ngap][2]=eh1;
                                firstocc[ngap]=taxonlist[j];

                                //search remaining sequences and score presence of this gap
                                gapcount=0;
                                for (int k=0; k<ntax; k++)  {
                                    eh2=findHardEnd(datamatrix[k],i,gapchar);
                                    es2=findSoftEnd(datamatrix[k],i,gapchar);

                                    // Decision tree  : how to code gap in this sequence
                                    if(datamatrix[k][i]==gapchar) {   //  1.X  FIRST GROUP pos=gap
                                        if(eh2<eh1) gapmatrix[k]+=gapAbsent;   // 1.1. gap ends early
                                        else {
                                            if (eh2==eh1 && es2==eh1) {   // 1.2 identical hard ends
                                                if (datamatrix[k][i-1]==gapchar || datamatrix[k][i-1]==unknownChar1 || datamatrix[k][i-1]==unknownChar2 || datamatrix[k][i-1]==unknownChar3) gapmatrix[k]+=gapUnknown;  // gap starts early or has soft start
                                                else {      // gap present
                                                    gapmatrix[k]+=gapPresent;
                                                    gapcount++;
                                                    skipseq[k]=true;
                                                }
                                            }
                                            else gapmatrix[k]+=gapUnknown;  // 1.3 early soft end or gap longer, start doesnt matter
                                        }
                                    }
                                    else  {
                                        if (datamatrix[k][i]==unknownChar1 || datamatrix[k][i]==unknownChar2 || datamatrix[k][i]==unknownChar3) {    //  2.X  SECOND GROUP pos=unknown
                                            if (eh2<eh1) gapmatrix[k]+=gapAbsent;   // 2.1  gap ends early
                                            else gapmatrix[k]+=gapUnknown;      //   2.2.  gap possibly as long or longer but has soft start
                                        }
                                        else gapmatrix[k]+=gapAbsent;  //  3.X  THIRD GROUP pos=nucelotide, gap absent
                                    }  // end of composite primary if statement
                                }  // end of k loop
                                gaplist[ngap][3]=gapcount;
                                if(evalinf(gapmatrix,ntax,ngap-1)==false) infvalgap+="0";
                                else infvalgap+="1";
                            } // end of IF check for pseudogap running to hind edge of alignment
                        } //end of new gap found (!skipseq)
                    } // end of  if(datamatrix[j][i]==gapchar) {
                }  // end of j loop (ntax)
            }  //end of i loop (nchar)
        } //end of if(codegaps) statement

        //rewrite datamatrix, gapmatrix, and gaplist

        outfile << "[Datafile: " << filename1.c_str() << "]\n";  //datamatrix
        for (int w=0; w<ntax; w++) fdatamatrix[cfile][w]=datamatrix[w];

        for (int i=0; i<ntax; i++) {
            outfile << taxonlist[i];
            for (unsigned int z=0; z<(maxname-taxonlist[i].size())+3;z++) outfile << " ";
            outfile << datamatrix[i] << "\n";  // all nucleotid characters coded
        }
        outfile << "\n\n";

        a_nchar[cfile]=nchar;

        if (codegaps)  {
            outfile << "[GAPMATRIX: absent='" << gapAbsent << "'  present='" << gapPresent;
            outfile << "'  unknown='" << gapUnknown << "']\n";

            for (int i=0; i<ntax; i++)    {
                outfile << taxonlist[i];
                for (unsigned int z=0; z<(maxname-taxonlist[i].size())+3;z++) outfile << " ";
                outfile << gapmatrix[i];
                outfile <<"\n";
            }

            outfile << "\n[GAPLIST: * = gap uninformative]\n";
            outfile << "[ ";
            outfile.width(5); outfile << "no.";
            outfile.width(40); outfile << " base pos.  #occ   first occurrence]\n";
            for (int i=1; i<ngap+1; i++)    {
                outfile << "[ ";
                outfile.width(4); outfile << i << ": ";
                outfile.width(5); outfile << (gaplist[i][1]+1) << " - ";
                outfile.width(4); outfile << (gaplist[i][2]+1) << " ";
                outfile.width(5); outfile << gaplist[i][3];
                if (infvalgap[i-1]=='0') outfile << "*  ";
                else outfile << "   ";
                outfile << firstocc[i] << " ]";
                outfile << "\n";
            }
            outfile << "\n\n";
        }   // end of if(codegaps) statement

        if (codegaps) a_ngap[cfile]=ngap;
        else a_ngap[cfile]=0;

        if (cfile==0) {
            partitions[cfile][0]=1;   //start of nuc
            partitions[cfile][1]=a_nchar[cfile];        //end of nuc
            partitions[cfile][2]=partitions[cfile][1]+1;                 // start of gaps
            partitions[cfile][3]=partitions[cfile][1]+a_ngap[cfile];       // end of gaps
        }
        else {
            partitions[cfile][0]=partitions[cfile-1][3]+1;
            partitions[cfile][1]=partitions[cfile-1][3]+a_nchar[cfile];
            partitions[cfile][2]=partitions[cfile][1]+1;
            partitions[cfile][3]=partitions[cfile][1]+a_ngap[cfile];
        }
        for (int i=0; i<ntax; i++) datamatrix[i]="";

    }   // END of file reading loop

    outfile.close();

	//REWRITE OUTPUT FILE

    //read file into memory
    int length;
    char * buffer;

    ifstream is;
    is.open (filename2.c_str());

    // get length of file:
    is.seekg (0, ios::end);
    length = is.tellg();
    is.seekg (0, ios::beg);

    // allocate memory:
    buffer = new char [length];

    // read data as a block:
    is.read (buffer,length);

    is.close();

    //WRITE FINAL OUTPUT FILE
    outfile.open(filename2.c_str());
    //outfile<<buffer;
    //outfile.close();
    //Form1->RichEdit1->Lines->LoadFromFile(filename2.c_str());
    //return;
    //ofstream finalfile(filename2.c_str());
	int totchar=0;
	for (int i=0;i<nfiles;i++) totchar=totchar+a_nchar[i]+a_ngap[i];
    lt=time(NULL);
    ptr=localtime(&lt);
    temp=asctime(ptr);
    temp.erase(temp.size()-1,temp.size());
    outfile << "#NEXUS\n";
    outfile << "[File written by program FastGap " << cVersion << " -- ";
    outfile << "Time: " << temp << "]\n\n";
    outfile << "BEGIN DATA;\n";
    outfile << "\tDIMENSIONS NTAX=" << ntax << "  NCHAR=" << totchar << ";\n";
    outfile << "\tFORMAT MISSING=";
    if(gapUnknown=='-') outfile << "?";
    else outfile<<gapUnknown;
    outfile << " DATATYPE=DNA GAP=- INTERLEAVE";
    if(gapAbsent=='0') outfile <<" SYMBOLS=\"0 1\"";
    outfile << ";\n";
    outfile << "\tOPTIONS GAPMODE=MISSING;\n";
    outfile << "MATRIX\n";

    //WRITE DATMATRIX BLOCKS IN MEMORY
    if(codegaps)  {
        readcont=true;
        while (readcont) {
            if (buffer[length-1]==']') readcont=false;
            else {
                buffer[length-1]=NULL;
                length--;
            }
        }
    }
    outfile << buffer;
    outfile << "\n;\nEND;\n\n\n";

    // TERMINATE MATRIX BLOCK
    outfile << "BEGIN SETS;\n";
	for (int cfile=0;cfile<nfiles;cfile++)  {
        outfile << "\tcharset region" << (cfile+1) << "_nuc = ";
        outfile << partitions[cfile][0] << "-" << partitions[cfile][1] << ";\n";
        if (codegaps) {
            outfile << "\tcharset region" << (cfile+1) << "_gaps = ";
            outfile << partitions[cfile][2] << "-" << partitions[cfile][3] << ";\n";
        }
	}
    if(codegaps) {
        outfile << "\tcharset gaps = ";
        for (int i=0;i<nfiles;i++) outfile << "region" << (i+1) << "_gaps ";
        outfile <<";";
    }
    outfile << "\nEND;\n\n\n";

    if(outgroup>0)  {
        outfile << "BEGIN PAUP;\n";
        outfile << "\tSet INCREASE=AUTO;\n";
        outfile << "\toutgroup ";
        for (int i=0;i<outgroup-1;i++) outfile << taxonlist[i] << " ";
        outfile << taxonlist[outgroup-1] << ";\n";
        outfile << "END;\n\n\n";
    }
    outfile.close();
    delete buffer;

    Form1->RichEdit1->Lines->LoadFromFile(filename2.c_str());

}


//-------------------------------------------------------------------

int findHardEnd (string sec, int pos, char gapchar)
{
    bool ingap=true;               //check endpos
    int eh;
    while (ingap)  {
        pos++;
        if(sec[pos]==gapchar  || sec[pos]==unknownChar1 || sec[pos]==unknownChar2 || sec[pos]==unknownChar3) ingap = true;
        else ingap=false;
    }
    eh=pos-1;
    return eh;
}

//--------------------------------------------------------------------

 int findSoftEnd (string sec, int pos, char gapchar)
{
    bool ingap=true;               //check endpos
    int es;
    while (ingap)  {
        pos++;
        if(sec[pos]==gapchar) ingap = true;
        else ingap=false;
    }
    es=pos-1;
    return es;
}

//-----------------------------------------------------------------------------


bool evalinf (string cmatrix[maxtax], int ntax, int cpos)  {
    int numAbsent=0, numPresent=0;
    bool result;

    for (int i=0;i<ntax;i++)  {
        if (cmatrix[i][cpos] == gapAbsent) numAbsent++;
        if (cmatrix[i][cpos] == gapPresent) numPresent++;
        //if (cmatrix[i][cpos] == gapUnknown) {numAbsent++; numPresent++;}
    }
    if (numAbsent>1 && numPresent>1) result=true;
    else result=false;
    return result;
}