######################################################################################
# GeneEvolve version 0.70                                                            #
# AGENT BASED MODEL FOR EVOLVING AND CREATING EXTENDED PEDIGREE & TWIN DATA          #
# by Matthew C Keller, updated 11/19/2007                                            #
######################################################################################


#TO RUN: 1) open the GeneEvolve.zip file in your working directory - all files must remain in that directory.
#        2) make changes to the parameters below by placing your desired inputs to the *right* of the arrows (<-).
#        3) advanced users might consider changing parameters under section 1.5
#        4) run entire script, either in BATCH mode (e.g., R CMD BATCH GeneEvolve69.R) or by highlighting the whole thing and running it in an R session.


remove(list=ls())

#1 USER INPUT PARAMETERS -           
#####################################################################################################################################
working.directory <- "C:/temp/GeneEvl"     #working directory where all files saved
save.gen.data <- "no"           #do you want to write out the datasets for each generation? Answer "no" unless you want a detailed ancestral record
make.graphics <- "yes"           #do you want to create a full graphical report at the end of each run?
sib.cor <- "yes"                 #do you want R to compute the correlations b/w sibs for each generation?

#DEMOGRAPHIC DETAILS:
number.generations <- 20                  #number of generations to evolve before population makes twin/spouse parents, twins, sibs, spouses, & twin children
start.pop.size <- 500                   #breeding population size at start of simulation; should be > 100 to insure stability; to mimic ozva60k = 70500
pop.growth <- rep(1,number.generations)  #vector of length number.generations that tells how big each generation is relative to the one before it;
                                         #for no growth: rep(1,number.generations); for constant 5% growth: rep(1.05,number.generations)               

#MODEL DETAILS:
number.genes <- 5                #number of genes affecting phenotype; Must be > 2 but <= 50
am.model <- "I"                  # "I" = "primary phenotypic assortment" - correlation b/w mates due to their choosing similar phenotypes
                                 # "II" = "social homogamy" - mating similarity comes through correlated environmental factors
                                 # "III" = "familial social homogamy" - mating similarity comes only through correlated "F" (familial) factor

#DATASET PARAMETERS
percent.mz <- .44                #pct of twins who are MZ (in the last generation); to mimic ozva60k = .44
sibling.age.cor <- .5            #correlation between siblings' ages within a breeding cohort. Higher correlations imply less spacing between sibs and/or twins.
range.pop.age <- c(18,80)        #range of age in the population over which the beta terms below explain the AGE and A.by.AGE variation; recommend placing this the same as the first two entries in range.age.twin
range.dataset.age <- c(14,90)    #The full datasets include not only twins, but also offspring,sibs,spouses, and parents. This is the range of ages in that full dataset
range.twin.age <- c(18,80)       #range of twin ages in final dataset at each timepoint; VECTOR twice as long as # of measurements
                                 #example 1, for data of twins measured at exactly 14 and 18: range.twin.age <- c(14,14,18,18)
                                 #example 2, for data of adult twins of all ages measured once: range.twin.age <- c(18,100)

#VARIANCES:                      #var terms of 1st gen (they may change thereafter). Its nice, but unnecessary, for them to sum to 1
#Genetic Factors:
A  <- .3                         #var acct for by A (additive genetic variance); also, variance of the intercepts (or levels) of A
AA <- .05                         #var acct for by AxA (additive-by-additive) epistasis. 
D  <- .05                         #var acct for by D (dominance genetic variance)
#Environmental Factors:
U <- .2                          #var acct for by U (unique variance). Note: in non-twins, E (eg, the E from ACE models)=U+MZ+T
MZ <- .0                          #var acct for by special MZ twin env (this goes into E for non-mz individuals)
TW <- .0                         #var acct for by special twin env (this goes into E for non-twin individuals)
F <- .1                          #var acct for by F (familial environment). This is passed from parents to offspring via vertical transmission
S <- .1                         #var acct for by S (sibling environment). Note: in only children, this is part of E.
#Covariates & Moderators
AGE <- .0                        #var acct for by age; This term can be negative; sqrt(abs(AGE))= beta(age) = change in phenotype over 1 SD of age (15-45); may be negative
SEX <- .0                        #var acct for by sex (female=1,male=0); This term can be negative; sqrt(abs(SEX))= beta(sex) = change in phenotype over 1 SD of sex; may be negative
A.by.AGE <- .1                   #var acct for by interaction bw A & Age; Also, the var of slopes of A across age range specified in range.pop.age
A.by.SEX <- .1                   #var acct for by interaction bw A & Sex; Also, the var of slopes of A across gender. 
A.by.S <- .0                      #var acct for by interaction bw A & S (sib env).
A.by.U <- .0                      #var acct for by interaction bw A & U (unique env).

#COVARIANCES:                    #Note: when there are covariances between components, phenotypic variance will probably not begin the simulation at 1
latent.AM <- rep(.3,number.generations) #VECTOR of length = number.generations; correlation bw spouses' latent mating phenotype each generation, as determined by am.model.
                                        #NOTE: the actual phenotypic correlation bw spouses is <= to this
A.S.cor  <- .0                    #correlation between A & S - caused by ACTIVE g-e covariance, or people choosing their (sib) environments
A.U.cor <- .0                     #correlation between A & U - caused by ACTIVE g-e covariance, or people choosing their (unique) environments
R.Alevel.Aslope.age <- .0         #corr bw intercept & slope. Should be between -1 & 1. Consider making = 1, -1, or 0 for cleaner interpretation.
R.Alevel.Aslope.sex <- .0         #corr bw intercept & slope. Should be between -1 & 1. Consider making = 1, -1, or 0 for cleaner interpretation.
R.Alevel.Aslope.S <- .0           #corr bw intercept & slope. Should be between -1 & 1. Consider making = 1, -1, or 0 for cleaner interpretation.
R.Alevel.Aslope.U <- .0           #corr bw intercept & slope. Should be between -1 & 1. Consider making = 1, -1, or 0 for cleaner interpretation.
                                 #Note on R.Alevel.Aslope.X: these are the corr's between the intercepts and slopes of A;
                                 #e.g., do people with the highest A values also have the highest positive changes in A across X?
                                 #if this is 1 (-1), A.by.X becomes the scalar interaction coefficient - i.e., the PURCEL model where h2 increases (decreases) as X increases
                                 #if this is 0, A.by.X becomes nonscalar - i.e., h2 remains ~ constant but diff genes turn on at diff values of X
                                 #WARNING: for scalar interactions, including only interaction var with no main effect var of A will not produce what is desired!
####################################################################################################################################













####################################################################################################
#Start (NO NEED TO ALTER THE SCRIPT BELOW THIS LINE)
#####################################################################################################

#1.5 OPTIONS FOR ADVANCED USAGE (RUNNING MX, SAVING ALL OBJECTS FOR DEBUGGING, ETC)

############################################################################
#mx.location <-  "/home/genetics/genetics/matthew/bin/mxt"  #full path of location of mx (unix server example)
run.cascade.model <- "no"
run.stealth.model <- "no"
run.corr <- "no" 
real.missing <- "no"                #do you want missingness patterns to emulate the ozva60k?
number.runs <- 1                   #do multiple runs of GeneEvolve - useful if running MX to find power, bias, and var/covar of parameter estimates
continuation <- "no"              #is this a continuation of a job that stopped and that you are now restarting? Assures the old Parameter.Comparison file is not overwritten
save.rdata <-  "yes"                #do you want to save the .RData file?
save.objects <- "no"             #do you want to save all the R objects created? Generally, you should answer "no" unless you're debugging. Options: "yes" or "no".

gene.model <- "rare"             #this must be equal to "common" or "rare"; most people should choose "rare"!!! If "common", total variance will not usually be sum of var components due to cor of var components (e.g., A & D)
number.alleles <- 2              #in the "common" model ONLY, how many alleles per locus? 


#convergence <- rep(0,number.generations) #***NOT YET IMPLEMENTED***; a VECTOR of length = PAR$number.generations; corr bw spouses due to PAR$convergence after marriage
#working.directory <- "/lisa_vg4_lv1/HOME/matthew/PedEvolve/P39"      #working directory where all files saved
#working.directory <- "C:/Matts Folder/RESEARCH/PedEvolve/PE47"     #working directory where all files saved 
#working.directory <- "/temp"                    #working directory where all files saved
#mx.location <- "C:/Program Files/MX/bin/MXE.EXE"  #full path of location of mx (windows example)
mx.location <-  "/usr/local/bin/mxt"  #full path of location of mx (unix server example)
############################################################################






#2 LOAD FUNCTIONS USED IN SCRIPT & CREATE LISTS OF PARAMETER VALUES ABOVE TO ENTER INTO FUNCTIONS

############################################################################
#2.0 LOAD FUNCTIONS
#change working directory
setwd(working.directory)

#load all PE functions
source("GE.Functions_Short.R")
source("GE.Function_AM.R")
source("GE.Function_Reproduce.R")
source("GE.Function_MakeDataFile.R")
source("GE.Function_MakePhenotype.R")
source("GE.Function_Run.Gene.Evolve71.R")
source("GE.Function_RunCascade.R")
source("GE.Function_Graph.R")

 
#make PAR and VAR lists; remove unnecessary variables
vars <- c("A", "A.S.cor","A.U.cor","A.by.AGE","A.by.S","A.by.U","AA","AGE","D","F","MZ","S","TW","U","SEX","A.by.SEX","R.Alevel.Aslope.S", 
"R.Alevel.Aslope.U","R.Alevel.Aslope.age","R.Alevel.Aslope.sex","am.model","latent.AM","make.graphics","mx.location",
"number.generations","number.genes","percent.mz","pop.growth","range.dataset.age","range.pop.age",     
"range.twin.age","save.gen.data","save.objects","sib.cor","sibling.age.cor","start.pop.size","working.directory",
 "run.cascade.model","run.stealth.model","gene.model","number.alleles","real.missing","save.rdata","number.runs","continuation","run.corr")

PAR1 <- list(working.directory=working.directory, save.gen.data=save.gen.data, save.objects=save.objects,
            sib.cor=sib.cor, mx.location=mx.location,make.graphics=make.graphics, number.generations=number.generations,
            start.pop.size=start.pop.size, pop.growth=pop.growth, number.genes=number.genes,am.model=am.model, percent.mz=percent.mz,
            range.dataset.age=range.dataset.age, sibling.age.cor=sibling.age.cor,range.twin.age=range.twin.age, range.pop.age=range.pop.age,
            run.cascade.model=run.cascade.model, run.stealth.model=run.stealth.model, gene.model=gene.model, number.alleles=number.alleles,
            real.missing=real.missing, save.rdata=save.rdata,number.runs=number.runs,continuation=continuation,run.corr=run.corr)
          
VAR1 <- list(A=A, AA=AA, D=D,F=F, S=S, U=U, MZ=MZ, TW=TW, SEX=SEX, AGE=AGE, A.by.SEX=A.by.SEX,A.by.AGE=A.by.AGE, A.by.S=A.by.S, A.by.U=A.by.U,
            latent.AM=latent.AM, A.S.cor=A.S.cor, A.U.cor=A.U.cor, R.Alevel.Aslope.sex=R.Alevel.Aslope.sex,R.Alevel.Aslope.age=R.Alevel.Aslope.age,
            R.Alevel.Aslope.S=R.Alevel.Aslope.S, R.Alevel.Aslope.U=R.Alevel.Aslope.U)

remove(list=vars);remove(vars)


#record time
PAR1$t1 <- Sys.time()

############################################################################






############################################################################
#3 RUN GENEVOLVE, RUN CASCADE & CREATE GRAPHICS

for (iteration in 1:PAR1$number.runs){

 #Run GeneEvolve
 ALL <- run.gene.evolve(PAR=PAR1,VAR=VAR1)

 gc()
   
 #Run Cascade model
 MX <- run.cascade.mx(ALL.PARS=ALL$PAR, iter=iteration, PE.Var2=ALL$PE.Var2,PE.Var3=ALL$PE.Var3,rel.corr=ALL$rel.correlations,
                      filenames=ALL$TEMP$name.date, data.info=ALL$TEMP$info.widedata.updated, cont=PAR1$continuation, run.twice=TRUE)

 gc()
 
 #Generate Graphics - running function "Generate.Graphics" which is defined in script "GE.Graph.1.R"
 if (PAR1$make.graphics=="yes") {
  ge.graphics(date.names=ALL$TEMP$name.date, PARAMS=ALL$PAR, VARIANCES=VAR1, MX.info=MX, 
              PE.Var3=ALL$PE.Var3, PE.Var2=ALL$PE.Var2, rel.correlations=ALL$rel.correlations, 
              track.changes=ALL$track.changes, data.info=ALL$TEMP$info.widedata.updated)}

}
    


#Save *.RData file if save.rdata=="yes" - names it by date
if (PAR1$save.rdata=="yes"){ save.image(file=paste(ALL$TEMP$name.date,".RData",sep="")) }

############################################################################



                                             


#Sample Times for how long script takes:
#Oct 18 2007. MacBook Pro 2.4 GHz Core2 Duo 4GB 667 MHz DDR2 SDRAM, 800 FSB: 16.5 min for:
#   number.generations <- 50              
#   start.pop.size <- 30000           
#   number.genes <- 40