VERBOSITY_LEVEL = -1;
/* This is an example HY-PHY Batch File.



   It reads a 3 taxa dataset "data/3.seq", performs

   an HKY85 relative rate analysis on the data.

   Having finished that, the code simulates 100 replicates of the data

   using MLE of the parameters under the null hypothesis of equal  

   rates, conducts an HKY relative rate analysis on each of the 

   replicates and then tabulates the distribution of resulting

   likelihood ratio statistic.

   

   

   Sergei L. Kosakovsky Pond and Spencer V. Muse 

   December 1999. 

*/



/* 1. Read in the data and store the result in a DataSet variable.*/



DataSet 		nucleotideSequences = ReadDataFile ("data/3.seq");

   

/* 2. Filter the data, specifying that all of the data is to be used

	  and that it is to be treated as nucleotides. */

	  

DataSetFilter	filteredData = CreateFilter (nucleotideSequences,1);



/* 3. Collect observed nucleotide frequencies from the filtered data. observedFreqs will

	  store the vector of frequencies. */



HarvestFrequencies (observedFreqs, filteredData, 1, 1, 1);



/* 4. Define the HKY substitution matrix. '*' is defined to be -(sum of off-diag row elements) */



HKY85RateMatrix  = 

		{{*,trvs,trst,trvs}

		 {trvs,*,trvs,trst}

		 {trst,trvs,*,trvs}

		 {trvs,trst,trvs,*}};

		 

/*5.  Define the HKY85 model, by combining the substitution matrix with the vector of observed (equilibrium)

	  frequencies. */

	  

Model HKY85	 = (HKY85RateMatrix, observedFreqs);



/*6.  Now we can define the simple three taxa tree. */

	  

Tree	threeTaxaTree = (a,b,og);



/*7.  Since all the likelihood function ingredients (data, tree, equilibrium frequencies) have been defined we are ready to construct the likelihood function. */

	  

LikelihoodFunction  theLnLik = (filteredData, threeTaxaTree);



/*8.  Maximize the likelihood function, storing parameter values in the matrix paramValues. We also store the resulting ln-lik. */



Optimize (paramValues, theLnLik);

unconstrainedLnLik = paramValues[1][0];



fprintf  (stdout, "\n----ORIGINAL DATA----");

		 

fprintf  (stdout, "\n 0).UNCONSTRAINED MODEL:", theLnLik);



/*10. We now constrain the rate of evolution to be equal along the branches leading to a and b and repeat the optimization. We will impose the constraint: both rates are equal.

*/

	  

threeTaxaTree.b.trst := threeTaxaTree.a.trst;

threeTaxaTree.b.trvs := threeTaxaTree.a.trvs;

Optimize (paramValues, theLnLik);

obslnlikDelta = 2 (unconstrainedLnLik-paramValues[1][0]);

/* 2df test since we constrain ts and tv rates.... */

obsPval = 1-CChi2 (obslnlikDelta, 2); 



fprintf (stdout, "\n\t 1). Both rates test. LR Statistic = ", obslnlikDelta,"\n", theLnLik,"\n  p-val (Chi-2) = ",obsPval,"\n\n");







/*11. Now we set up the simulation loop.

      First, we create another copy of the tree which will

	serve as the tree for simulated data sets. Since we will also use 	observed frequencies from the simulated data sets in our 	substitution model, we create a new Model. We will create a 	vector simFreqs to store the frequencies (the values will be 	saved in simFreqs inside the sim loop. We will count the number 	of significant LR tests using sigtests.*/

	

simFreqs = {{0.25,0.25,0.25,0.25}}; /* Just needed to create simFreqs*/

Model simHKY85	 = (HKY85RateMatrix, simFreqs);

Tree	simulatedTree = (a,b,og);

sigtests = 0;



/*12. This is a formatting stepm which sets print width for all numbers to 12 and prints the table header */



PRINT_DIGITS = 12;



fprintf (stdout, 

"\n|------------|------------|\n|  Simul. #  |LR Test Stat|\n|------------|------------|");



for (simCounter = 1; simCounter<=100; simCounter = simCounter+1)

{

/*13. Simulate a data set of the same size as the original set using the constrained MLE (the current values are still attached to theLnLik)of all the parameters */



	DataSet	simulatedData = SimulateDataSet (theLnLik);



/*14. Repeat the same steps as for the original data to obtain simulated ln-likelihood ratios*/

		  

	DataSetFilter    filteredSimData = CreateFilter(simulatedData,1);



/*15. Collect observed nucleotide frequencies from the filtered data. observedFreqs will store the vector of frequencies. */



	HarvestFrequencies (simFreqs, filteredSimData, 1, 1, 1);

	LikelihoodFunction simLik = (filteredSimData, simulatedTree);

	Optimize (simParamValues, simLik);

	unconstrainedLnLik = simParamValues[1][0];

	fprintf (stdout, "\n|", simCounter);

	/* Rate test */

	simulatedTree.b.trst := simulatedTree.a.trst;

	simulatedTree.b.trvs := simulatedTree.a.trvs;

	Optimize (paramValues, simLik);

	lnlikDelta = 2 (unconstrainedLnLik-paramValues[1][0]);

      if (lnlikDelta > obslnlikDelta) 

	{

		sigtests = sigtests + 1;

	}

	fprintf (stdout, "|", lnlikDelta,"|");

	ClearConstraints (simulatedTree);



}



fprintf (stdout, "\n|------------|------------|");

fprintf (stdout, "\n|Chi2 P-val= |",obsPval,"|");

fprintf (stdout, "\n| PBS P-val= |",sigtests/100,"|");

fprintf (stdout, "\n|------------|------------|");



   

