//global variables for this unit string cVersion="v1.2"; int const maxtax=2000, maxgap=1000, maxfiles=15; int nfiles=0, outgroup=0;; string filenames[maxfiles], c_line; char cread[1024]; bool codegaps=true, allgaps=true; char gapchar='-', gapAbsent='A', gapPresent='C', gapUnknown='-', unknownChar1='?', unknownChar2='n', unknownChar3='N'; //-------------------------------------------------------------------- // MAIN PROGRAM CODE void makeNexus (void) { int nchar=0, ntax, lastntax=0, ngap, eh1, eh2, es1, es2; unsigned int maxname=0; int ctax, gapcount; int gaplist[maxgap+1][4], partitions[maxfiles][4]; bool readcont, skipseq[maxtax]; string filename1, filename2, temp, cread; string taxonlist[maxtax], datamatrix[maxtax], gapmatrix[maxtax], firstocc[maxgap+1]; string fdatamatrix[maxfiles][maxtax], fgapmatrix[maxfiles][maxtax]; string infvalgap; //holding arrays for results from reading multiple files int a_nchar[maxfiles], a_ngap[maxfiles]; //functions int findHardEnd(string seq, int pos, char gapchar); int findSoftEnd(string seq, int pos, char gapchar); bool evalinf (string cmatrix[maxtax], int ntax, int cpos); struct tm *ptr; // used to display time and date, see cpp reference; time_t lt; Form1->SaveDialog1->Execute(); filename2=Form1->SaveDialog1->FileName.c_str(); ofstream outfile(filename2.c_str()); for (int cfile=0;cfile<nfiles;cfile++) { //main file reading loop filename1=Form1->Memo1->Lines->Strings[cfile].c_str(); ifstream infile(filename1.c_str()); if (!infile) { //file open error displaymsg ("file not found"); return; } ctax=0; //initialize datamatrix, gapmatrix with empty strings for (int i=0; i<maxtax; i++) { datamatrix[i]=""; gapmatrix[i]=""; } infile>>cread; while (!infile.eof()) { //read until eof readcont=true; while (readcont) { // block for skipping initial lines preceeding first ">" if (cread.substr(0,1)==">") readcont= false; else infile>>cread; } if (cfile==0) { //only read taxon list first time cread.erase(0,1); taxonlist[ctax]=cread; while (taxonlist[ctax].size()>5) { //prune trailing xx, 5 = minimum taxon name size if (taxonlist[ctax][taxonlist[ctax].size()-1]=='x') taxonlist[ctax].erase(taxonlist[ctax].size()-1); else break; } } readcont=true; while (readcont) { //read all nucleotid lines infile>>cread; if (cread.substr(0,1)==">" || infile.eof()) { readcont=false; break; } else datamatrix[ctax]=datamatrix[ctax]+cread; //concatenate individual parts (lines) of fasta sequence } ctax++; } nchar=datamatrix[0].size(); // length of first sequence in current datamatrix ntax=ctax; infile.close(); //file reading done, continue with data processing if (cfile==0) { //find maximum length of taxon names maxname=0; for (int i=0; i<ntax; i++) if (taxonlist[i].size()>maxname) maxname=taxonlist[i].size(); } if (cfile!=0 && ntax!=lastntax) { //taxa dimension error displaymsg ("ERROR: the number of taxa varies among datasets"); return; } lastntax=ntax; // remember number of taxa for next round //CODE GAPS if (codegaps) { ngap=0; for (int i=1; i<nchar-1; i++) { // gap coding loop going through base positions for (int j=0; j<ntax; j++) skipseq[j]=false; for (int j=0; j<ntax; j++) { // look for gaps in j sequences if(datamatrix[j][i]==gapchar) { //found gap if (datamatrix[j][i-1]!=gapchar && datamatrix[j][i-1]!=unknownChar1 && datamatrix[j][i-1]!=unknownChar2 && datamatrix[j][i-1]!=unknownChar3 && !skipseq[j]) { //gap starts here and has not been coded before eh1 = findHardEnd(datamatrix[j],i,gapchar); es1 = findSoftEnd(datamatrix[j],i,gapchar); if (eh1==es1 && eh1<nchar-1) { // CHECK that gap has hard end and does not run to hind edge of alignment ngap++; // one more gap has been coded gaplist[ngap][0]=ngap; gaplist[ngap][1]=i; gaplist[ngap][2]=eh1; firstocc[ngap]=taxonlist[j]; //search remaining sequences and score presence of this gap gapcount=0; for (int k=0; k<ntax; k++) { eh2=findHardEnd(datamatrix[k],i,gapchar); es2=findSoftEnd(datamatrix[k],i,gapchar); // Decision tree : how to code gap in this sequence if(datamatrix[k][i]==gapchar) { // 1.X FIRST GROUP pos=gap if(eh2<eh1) gapmatrix[k]+=gapAbsent; // 1.1. gap ends early else { if (eh2==eh1 && es2==eh1) { // 1.2 identical hard ends if (datamatrix[k][i-1]==gapchar || datamatrix[k][i-1]==unknownChar1 || datamatrix[k][i-1]==unknownChar2 || datamatrix[k][i-1]==unknownChar3) gapmatrix[k]+=gapUnknown; // gap starts early or has soft start else { // gap present gapmatrix[k]+=gapPresent; gapcount++; skipseq[k]=true; } } else gapmatrix[k]+=gapUnknown; // 1.3 early soft end or gap longer, start doesnt matter } } else { if (datamatrix[k][i]==unknownChar1 || datamatrix[k][i]==unknownChar2 || datamatrix[k][i]==unknownChar3) { // 2.X SECOND GROUP pos=unknown if (eh2<eh1) gapmatrix[k]+=gapAbsent; // 2.1 gap ends early else gapmatrix[k]+=gapUnknown; // 2.2. gap possibly as long or longer but has soft start } else gapmatrix[k]+=gapAbsent; // 3.X THIRD GROUP pos=nucelotide, gap absent } // end of composite primary if statement } // end of k loop gaplist[ngap][3]=gapcount; if(evalinf(gapmatrix,ntax,ngap-1)==false) infvalgap+="0"; else infvalgap+="1"; } // end of IF check for pseudogap running to hind edge of alignment } //end of new gap found (!skipseq) } // end of if(datamatrix[j][i]==gapchar) { } // end of j loop (ntax) } //end of i loop (nchar) } //end of if(codegaps) statement //rewrite datamatrix, gapmatrix, and gaplist outfile << "[Datafile: " << filename1.c_str() << "]\n"; //datamatrix for (int w=0; w<ntax; w++) fdatamatrix[cfile][w]=datamatrix[w]; for (int i=0; i<ntax; i++) { outfile << taxonlist[i]; for (unsigned int z=0; z<(maxname-taxonlist[i].size())+3;z++) outfile << " "; outfile << datamatrix[i] << "\n"; // all nucleotid characters coded } outfile << "\n\n"; a_nchar[cfile]=nchar; if (codegaps) { outfile << "[GAPMATRIX: absent='" << gapAbsent << "' present='" << gapPresent; outfile << "' unknown='" << gapUnknown << "']\n"; for (int i=0; i<ntax; i++) { outfile << taxonlist[i]; for (unsigned int z=0; z<(maxname-taxonlist[i].size())+3;z++) outfile << " "; outfile << gapmatrix[i]; outfile <<"\n"; } outfile << "\n[GAPLIST: * = gap uninformative]\n"; outfile << "[ "; outfile.width(5); outfile << "no."; outfile.width(40); outfile << " base pos. #occ first occurrence]\n"; for (int i=1; i<ngap+1; i++) { outfile << "[ "; outfile.width(4); outfile << i << ": "; outfile.width(5); outfile << (gaplist[i][1]+1) << " - "; outfile.width(4); outfile << (gaplist[i][2]+1) << " "; outfile.width(5); outfile << gaplist[i][3]; if (infvalgap[i-1]=='0') outfile << "* "; else outfile << " "; outfile << firstocc[i] << " ]"; outfile << "\n"; } outfile << "\n\n"; } // end of if(codegaps) statement if (codegaps) a_ngap[cfile]=ngap; else a_ngap[cfile]=0; if (cfile==0) { partitions[cfile][0]=1; //start of nuc partitions[cfile][1]=a_nchar[cfile]; //end of nuc partitions[cfile][2]=partitions[cfile][1]+1; // start of gaps partitions[cfile][3]=partitions[cfile][1]+a_ngap[cfile]; // end of gaps } else { partitions[cfile][0]=partitions[cfile-1][3]+1; partitions[cfile][1]=partitions[cfile-1][3]+a_nchar[cfile]; partitions[cfile][2]=partitions[cfile][1]+1; partitions[cfile][3]=partitions[cfile][1]+a_ngap[cfile]; } for (int i=0; i<ntax; i++) datamatrix[i]=""; } // END of file reading loop outfile.close(); //REWRITE OUTPUT FILE //read file into memory int length; char * buffer; ifstream is; is.open (filename2.c_str()); // get length of file: is.seekg (0, ios::end); length = is.tellg(); is.seekg (0, ios::beg); // allocate memory: buffer = new char [length]; // read data as a block: is.read (buffer,length); is.close(); //WRITE FINAL OUTPUT FILE outfile.open(filename2.c_str()); //outfile<<buffer; //outfile.close(); //Form1->RichEdit1->Lines->LoadFromFile(filename2.c_str()); //return; //ofstream finalfile(filename2.c_str()); int totchar=0; for (int i=0;i<nfiles;i++) totchar=totchar+a_nchar[i]+a_ngap[i]; lt=time(NULL); ptr=localtime(<); temp=asctime(ptr); temp.erase(temp.size()-1,temp.size()); outfile << "#NEXUS\n"; outfile << "[File written by program FastGap " << cVersion << " -- "; outfile << "Time: " << temp << "]\n\n"; outfile << "BEGIN DATA;\n"; outfile << "\tDIMENSIONS NTAX=" << ntax << " NCHAR=" << totchar << ";\n"; outfile << "\tFORMAT MISSING="; if(gapUnknown=='-') outfile << "?"; else outfile<<gapUnknown; outfile << " DATATYPE=DNA GAP=- INTERLEAVE"; if(gapAbsent=='0') outfile <<" SYMBOLS=\"0 1\""; outfile << ";\n"; outfile << "\tOPTIONS GAPMODE=MISSING;\n"; outfile << "MATRIX\n"; //WRITE DATMATRIX BLOCKS IN MEMORY if(codegaps) { readcont=true; while (readcont) { if (buffer[length-1]==']') readcont=false; else { buffer[length-1]=NULL; length--; } } } outfile << buffer; outfile << "\n;\nEND;\n\n\n"; // TERMINATE MATRIX BLOCK outfile << "BEGIN SETS;\n"; for (int cfile=0;cfile<nfiles;cfile++) { outfile << "\tcharset region" << (cfile+1) << "_nuc = "; outfile << partitions[cfile][0] << "-" << partitions[cfile][1] << ";\n"; if (codegaps) { outfile << "\tcharset region" << (cfile+1) << "_gaps = "; outfile << partitions[cfile][2] << "-" << partitions[cfile][3] << ";\n"; } } if(codegaps) { outfile << "\tcharset gaps = "; for (int i=0;i<nfiles;i++) outfile << "region" << (i+1) << "_gaps "; outfile <<";"; } outfile << "\nEND;\n\n\n"; if(outgroup>0) { outfile << "BEGIN PAUP;\n"; outfile << "\tSet INCREASE=AUTO;\n"; outfile << "\toutgroup "; for (int i=0;i<outgroup-1;i++) outfile << taxonlist[i] << " "; outfile << taxonlist[outgroup-1] << ";\n"; outfile << "END;\n\n\n"; } outfile.close(); delete buffer; Form1->RichEdit1->Lines->LoadFromFile(filename2.c_str()); } //------------------------------------------------------------------- int findHardEnd (string sec, int pos, char gapchar) { bool ingap=true; //check endpos int eh; while (ingap) { pos++; if(sec[pos]==gapchar || sec[pos]==unknownChar1 || sec[pos]==unknownChar2 || sec[pos]==unknownChar3) ingap = true; else ingap=false; } eh=pos-1; return eh; } //-------------------------------------------------------------------- int findSoftEnd (string sec, int pos, char gapchar) { bool ingap=true; //check endpos int es; while (ingap) { pos++; if(sec[pos]==gapchar) ingap = true; else ingap=false; } es=pos-1; return es; } //----------------------------------------------------------------------------- bool evalinf (string cmatrix[maxtax], int ntax, int cpos) { int numAbsent=0, numPresent=0; bool result; for (int i=0;i<ntax;i++) { if (cmatrix[i][cpos] == gapAbsent) numAbsent++; if (cmatrix[i][cpos] == gapPresent) numPresent++; //if (cmatrix[i][cpos] == gapUnknown) {numAbsent++; numPresent++;} } if (numAbsent>1 && numPresent>1) result=true; else result=false; return result; }