library(MASS)
library(lattice)
source("boulder_library.R")

#-----------------------------------------------------------
# Looking at an F2 cross where the QTL is one of the markers
#-----------------------------------------------------------

# read marker data
marker.data <- read.delim("f2.markers")
head(marker.data)

# read genetic data using our function
ped  <- read.ped("f2.ped", marker.names = marker.data$name)
head(ped)

# test marker association of qtl to genotype

# fit a model to a marker that's not the qtl
fit <- lm(Phenotype ~ m1, data=ped)
fit
str(fit)


# test whether m1 ontributes significantly
fit0 <- lm(Phenotype ~ 1, data=ped)
fit0

# compare the two models
fit.anova <- anova(fit0, fit)
fit.anova

# a short cut to doing this is to just give anova() the bigger model
fit.anova <- anova(fit)
fit.anova

# fit a linear model, store the results in fit, look at the results
fit <- lm(Phenotype ~ q1, data=ped)


fit.anova <- anova(fit)
fit.anova

# check for dominance
# first look at how the additive model is coded
ped$q1

# easier to just print out, say, the first 10 values
ped$q1[1:10]

# to get each of these numbers to a number representing dominance
# we need to generate new numbers equal to 1-x^2
# eg, if x is -1 then

x <- -1
1 - x^2

# we can do this to many observations at a time
1 - (ped$q1[1:10])^2

# and even write a function to do it for us
# Note that g is just a placeholder for whatever you give the function

dominance.part <- function(g) { return (1-g^2) }

# now we can use this function in R window
dominance.part(-1)

dominance.part(0)
dominance.part(1)

dominance.part(ped$q1[1:10])

# fit each model
fit0 <- lm(Phenotype ~ 1, data=ped)
fit1 <- lm(Phenotype ~ q1, data=ped)
fit2 <- lm(Phenotype ~ q1 + dominance.part(q1), data=ped)

# do comparison of nested models (aka nested anova)
anova(fit0, fit1, fit2)
# alternatively do it with single argument version of anova()
anova(fit2)

# compare no effect against full model (additive and dominance)
anova(fit0, fit2)

# extract the pvalue from the anova table
pval <- fit.anova["q1", "Pr(>F)"]
pval

# work out the logP
logP <- -log10(pval)
logP

# work out the logP for all markers and store the results in a
# table "chrom.scan"
scan.markers(Phenotype ~ MARKER, data=ped, marker.data=marker.data)

chrom.scan <- scan.markers(Phenotype ~ MARKER, data=ped, marker.data=marker.data)
chrom.scan

plot(logP ~ cM, data=chrom.scan, type="b")

#------------------------------
# what should be our threshold?
#------------------------------

# pval = 0.05?
simple.threshold <- -log10(0.05)
abline(h=simple.threshold, col="red")

# bonferroni correct pval = 0.05 / number of markers
bonferroni.threshold <- -log10(0.05/21)
abline(h=bonferroni.threshold, col="blue")

# Best way is to calculate threshold by permutation:
# randomly shuffle the phenotype values of the animals
# and redo the chromosome scan. Do this many times
# to work out what logP value is exceeded 0.05 or less of the time.

# The sample() function shuffles a sequence, eg,
x <- c(1,2,3,4,5)
sample(x)
sample(x)
sample(x)

# make a copy of ped data
perm.ped <- ped
# shuffle the phenotypes so many animals have other animals phenotypes
perm.ped$Phenotype <- sample(ped$Phenotype)
# rescan
perm.scan <- scan.markers(Phenotype ~ MARKER, data=perm.ped, marker.data=marker.data)
perm.scan

# all the numbers look quite low
# the maximum is...
max(perm.scan$logP)

# do this many times, eg, 200
num.permutations <- 200
max.logPs <- numeric(num.permutations)
for (i in 1:num.permutations)
{
    # permutation step
    perm.ped$Phenotype <- sample(ped$Phenotype)
    perm.scan <- scan.markers(Phenotype ~ MARKER, data=perm.ped, marker.data=marker.data)
    max.logPs[i] <- max(perm.scan$logP)

    # show progress by printing output for each permutation
    cat("Permuting ", i, "/", num.permutations, ": max=", max.logPs[i], "\n")
    flush.console()
}

# calculate the logP that is exceeded only 0.05 of the time
# (should be about 2.2)
threshold <- quantile(max.logPs, 1 - 0.05)
threshold
abline(h=threshold, col="green")

# look at the distribution of max logPs obtained in the permutations
hist(max.logPs)
abline(v=c(simple.threshold, bonferroni.threshold, threshold), col=c("red", "blue", "green"))
# they exceed the simple uncorrected threshold almost half the time!