#*********UN-COMMENT and RUN THIS CHUNK ONLY THE FIRST TIME*********#
#tip to uncomment: select chunk and press cmd + shift + c

# Here I am using the Dent set and DMY measurements from http://www.genetics.org/content/198/1/3.figures-only
# to run genomic prediction on the DH lines

##########
# Initialize (make directory changes here for easy reproducibility)
#########

# #clear environment
# rm(list = ls())
# 
# #make project directory
# system("mkdir -p ~/Desktop/maize-multiparent/data")
# 
# #move to data directory 
# setwd("~/Desktop/maize-multiparent/data")
# 
# #Download all relevant data files
# #Phenotypes
# system("curl http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4174941/bin/ac84ad1c1bdc1353a057ad4368f46681_genetics.114.161943-17.zip > Phenotypes.zip")
# system("unzip Phenotypes.zip")
# system("rm Phenotypes.zip")
# 
# #Dent genotypes (CFD file)
# system("curl ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE50nnn/GSE50558/suppl/GSE50558%5FCFD%5Fmatrix%5FGEO%2Etxt%2Egz > CFD.txt.gz")
# 
# #Flint genotypes (CFF file)
# system("curl ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE50nnn/GSE50558/suppl/GSE50558%5FCFF%5Fmatrix%5FGEO%2Etxt%2Egz > CFF.txt.gz")
# 
# #Parental genotypes (Parental file)
# system("curl ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE50nnn/GSE50558/suppl/GSE50558%5FParental%5Fmatrix%5FGEO%2Etxt%2Egz > Parental.txt.gz")
# 
# #------------
# # Data cleanup (quick and dirty)
# #------------
# 
# #--------
# # Genotype files
# #---------
# #Get genotypes from every file (column 1 is marker names, 
# #every 5th column starting from 2 up to 7000)
# #replace "NC" (missing) with NA, 
# #replace heterozygotes "AB" with NA
# 
# system("gzcat CFF.txt.gz | cut -f1,$(seq 2 5 7000 | paste -d, -s -) | 
#        sed 's/.GType//g' |
#        sed 's/NC/NA/g' |
#        sed 's/AB/NA/g' > CFF.txt")
# 
# system("gzcat CFD.txt.gz | cut -f1,$(seq 2 5 7000 | paste -d, -s -) | 
#        sed 's/.GType//g' |
#        sed 's/NC/NA/g' |
#        sed 's/AB/NA/g' > CFD.txt")
# 
# system("gzcat Parental.txt.gz | cut -f1,$(seq 2 5 7000 | paste -d, -s -) | 
#        sed 's/.GType//g' |
#        sed 's/NC/NA/g' |
#        sed 's/AB/NA/g' > Parental.txt")
# 
# #packages needed
# install.packages(c("lme4", "synbreed", "lattice", "outliers", "wesanderson"))
#---------
#Phenotypes
#---------

#clear environment
rm(list = ls())

#set working directory
setwd("~/Desktop/maize-multiparent/data/")

# read phenotypic data
phenotypes <- read.table("PhenotypicDataDent.csv", header = TRUE, sep = ",")
str(phenotypes)
## 'data.frame':    4800 obs. of  14 variables:
##  $ Genotype   : Factor w/ 949 levels "A287","B73","CFD02-003",..: 705 288 838 441 48 470 1 901 204 856 ...
##  $ Population : Factor w/ 23 levels "B73","CFD02",..: 10 5 11 6 2 7 NA 12 4 12 ...
##  $ Pedigree   : Factor w/ 23 levels "B73","D06","D09",..: 15 10 16 11 7 12 NA 17 9 17 ...
##  $ Tester     : Factor w/ 1 level "UH007": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Plotcode   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ LOC        : Factor w/ 4 levels "INR","KWS","SYN",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ IncBlockNo.: Factor w/ 120 levels "B1","B10","B100",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Rep        : Factor w/ 8 levels "R1","R2","R3",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ DMY        : num  157 164 204 189 169 ...
##  $ DMC        : num  41.9 42.7 38.6 42.2 38.9 42.8 37.5 40.5 42.8 40.3 ...
##  $ PH         : num  257 250 242 261 258 ...
##  $ DtTAS      : int  86 84 86 85 87 86 89 84 83 86 ...
##  $ DtSILK     : int  84 84 85 86 87 86 90 85 84 86 ...
##  $ NBPL       : int  80 80 78 79 80 80 80 52 79 73 ...
#MO17 has a typo...should be Mo17 to be consistent with genotypes...correct this
levels(phenotypes$Genotype)[levels(phenotypes$Genotype)=="MO17"] <- "Mo17"
levels(phenotypes$Population)[levels(phenotypes$Population)=="MO17"] <- "Mo17"


#What the hiz heck is happening here:
#In every location, label plots with less than 70% of the median number of plants (NBPL) as missing data

#vector of locations
locations <- levels(phenotypes$LOC)

#loop over locations
for(i in 1:length(locations)){
  #subset the data for location i
  location.sub <- subset(phenotypes, phenotypes$LOC==locations[i])
  
  #calculate the median NBPL for location i and the 70% threshold
  location.nbpl.median <- median(na.omit(location.sub$NBPL))
  nbpl.thresh <- 0.7 * location.nbpl.median
  
  #For bad plots, replace phenotype values in original file with NA, retain values for good plots
  phenotypes[as.numeric(rownames(location.sub)),]$DMY <-
    ifelse(location.sub$NBPL < nbpl.thresh, NA, location.sub$DMY)
}
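#Quick check (mine, not in the original analysis): how many DMY values are now
#missing (the count also includes any values that were missing to begin with)
sum(is.na(phenotypes$DMY))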
#------------
# Models and analyses
#------------

library(lme4)
## Warning: package 'lme4' was built under R version 3.2.3
## Loading required package: Matrix
#Model 1 from paper
model1 <- lmer(DMY ~ 
                 (1 | Genotype) +
                 (1 | Genotype:LOC) +
                 (1 | LOC/Rep/IncBlockNo.),
               data = phenotypes)
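#Optional peek (mine, not in the original write-up): the estimated variance
#components for Model 1 can be inspected with lme4's VarCorr()
#VarCorr(model1)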

#Attach residuals and fitted model values to the original data file
#(we index by row name rather than position so that rows lme4 dropped because of
#missing DMY simply get NA here)
for(i in 1:nrow(phenotypes)){
  
  phenotypes$fitted.random[i]    <- round(fitted(model1)[as.character(i)], 2)
  phenotypes$residuals.random[i] <- round(residuals(model1)[as.character(i)], 2)
  
}

#Look at what it did...added residuals and fitted values to table
head(phenotypes)
##    Genotype Population     Pedigree Tester Plotcode LOC IncBlockNo. Rep
## 1 CFD10-089      CFD10 F353 x UH250  UH007        1 INR          B1  R1
## 2 CFD05-026      CFD05 F353 x EC169  UH007        2 INR          B1  R1
## 3 CFD11-337      CFD11 F353 x UH304  UH007        3 INR          B1  R1
## 4 CFD06-347      CFD06  F353 x F252  UH007        4 INR          B1  R1
## 5 CFD02-302      CFD02   F353 x B73  UH007        5 INR          B1  R1
## 6 CFD07-076      CFD07  F353 x F618  UH007        6 INR          B1  R1
##     DMY  DMC  PH DtTAS DtSILK NBPL fitted.random residuals.random
## 1 157.0 41.9 257    86     84   80        170.79           -13.79
## 2 164.5 42.7 250    84     84   80        169.60            -5.10
## 3 203.8 38.6 242    86     85   78        197.36             6.44
## 4 189.1 42.2 261    85     86   79        185.26             3.84
## 5 169.1 38.9 258    87     87   80        176.76            -7.66
## 6 183.8 42.8 248    86     86   80        185.91            -2.11
#Histogram of residuals: paper says there are outliers...can you see this?
hist(phenotypes$residuals.random)

library(outliers)
#The outliers package has a function, grubbs.test(), that runs a Grubbs test and
#reports a single outlier at a time. We can remove that outlier and repeat the
#test until no more outliers are found at a given alpha level.

#We can define a function that recursively finds outliers. 
#Once we find all the outliers, we can "flag" them in our original file
#Skip past this function definition if it's boring...it should be ok to skip

grubbs.flag <- function(x, alpha) {
  #empty object to store outlier values
  outliers <- NULL
  
  #dummy p-value to start the loop
  pv <- alpha - 1
  
  #loop until no more outliers are found (i.e. until the p-value exceeds alpha)
  while(pv < alpha) {
    #all observations that are not already in the outliers object
    #(matching is by value, so x should be rounded to 2 decimals, as our residuals are)
    test <- na.omit(x[!x %in% outliers])
    
    #test these values and store the result of the test
    grubbs.result <- grubbs.test(test)
    
    #extract the outlying observation (parsed from the test's alternative-hypothesis
    #string) and add it to "outliers"; note the value from the final, non-significant
    #iteration also ends up here, so the stopping rule is slightly conservative
    outliers <- c(outliers,
                  round(as.numeric(strsplit(grubbs.result$alternative," ")[[1]][3]),2))
    
    #extract p-value (the iteration will break if this is greater than alpha)
    pv <- grubbs.result$p.value
  }
  
  #return a logical vector specifying whether observations in x are outliers or not
  return(x %in% outliers)
}
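#Quick sanity check on simulated data (mine, not from the paper): plant one obvious
#outlier in a normal sample and confirm it gets flagged. Values are rounded to 2
#decimals because the function matches outliers by their (rounded) value, just like
#our rounded residuals
# set.seed(1)
# x.demo <- round(c(rnorm(100), 15), 2)
# which(grubbs.flag(x.demo, alpha = 0.05)) #position 101 (the planted value) should be among the flagged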

#Now use this function and flag outlying observations!
phenotypes$flag <- grubbs.flag(phenotypes$residuals.random, alpha = 0.05)

#Look at what it did 
head(phenotypes)
##    Genotype Population     Pedigree Tester Plotcode LOC IncBlockNo. Rep
## 1 CFD10-089      CFD10 F353 x UH250  UH007        1 INR          B1  R1
## 2 CFD05-026      CFD05 F353 x EC169  UH007        2 INR          B1  R1
## 3 CFD11-337      CFD11 F353 x UH304  UH007        3 INR          B1  R1
## 4 CFD06-347      CFD06  F353 x F252  UH007        4 INR          B1  R1
## 5 CFD02-302      CFD02   F353 x B73  UH007        5 INR          B1  R1
## 6 CFD07-076      CFD07  F353 x F618  UH007        6 INR          B1  R1
##     DMY  DMC  PH DtTAS DtSILK NBPL fitted.random residuals.random  flag
## 1 157.0 41.9 257    86     84   80        170.79           -13.79 FALSE
## 2 164.5 42.7 250    84     84   80        169.60            -5.10 FALSE
## 3 203.8 38.6 242    86     85   78        197.36             6.44 FALSE
## 4 189.1 42.2 261    85     86   79        185.26             3.84 FALSE
## 5 169.1 38.9 258    87     87   80        176.76            -7.66 FALSE
## 6 183.8 42.8 248    86     86   80        185.91            -2.11 FALSE
#Now extract good plots and adjust rownames of new subset 
phenotypes.grubbed <- subset(phenotypes, phenotypes$flag==FALSE)
rownames(phenotypes.grubbed) <- seq(1, nrow(phenotypes.grubbed),1)

#Model 2 from paper
model2 <- lmer(DMY ~ 
                 (1 | Population) +
                 (1 | Population:Genotype) +
                 (1 | Population:Genotype:LOC) +
                 (1 | LOC/Rep/IncBlockNo.),
               data = phenotypes.grubbed)

#Same as before, attach residuals and fitted model values
for(i in 1:nrow(phenotypes.grubbed)){
  
  phenotypes.grubbed$fitted.random.grubbed[i]    <- fitted(model2)[as.character(i)]
  phenotypes.grubbed$residuals.random.grubbed[i] <- residuals(model2)[as.character(i)]
  
}
#------------
# Why "Grub"?
#------------

#I don't particularly like that they did this, but the motivation is probably to
#have the residuals meet assumptions. The following plots should maybe make this clear...

#View plots two at a time
par(mfrow=c(1,2)) 

#Histograms of residuals from the two models
hist(phenotypes$residuals.random, xlab = "Residuals", main = "Ungrubbed")
hist(phenotypes.grubbed$residuals.random.grubbed, xlab = "Residuals", main = "Grubbed")

#Observed Versus fitted model values
plot(DMY ~ fitted.random, data = phenotypes, xlab = "Fitted", main = "Ungrubbed")
plot(DMY ~ fitted.random.grubbed, data = phenotypes.grubbed, xlab = "Fitted", main = "Grubbed")

#Residuals Versus fitted model values
plot(residuals.random ~ fitted.random, data = phenotypes, 
     ylab = "Residuals", xlab = "Fitted", main = "Ungrubbed")
plot(residuals.random.grubbed ~ fitted.random.grubbed, data = phenotypes.grubbed, 
     ylab = "Residuals", xlab = "Fitted", main = "Grubbed")

#------------
#Calculate adjusted means for trait values using a fixed effect for genotype
#------------

#Model 3 from paper
model3 <- lmer(DMY ~ 
                 Genotype +
                 (1 | Genotype:LOC) +
                 (1 | LOC/Rep/IncBlockNo.),
               data = phenotypes.grubbed)

#Same as before, attach residuals and fitted model values
for(i in 1:nrow(phenotypes.grubbed)){
  
  phenotypes.grubbed$fitted.fixed[i]    <- fitted(model3)[as.character(i)]
  phenotypes.grubbed$residuals.fixed[i] <- residuals(model3)[as.character(i)]
  
}

#Take a look at the fixed effects
head(fixef(model3))
##       (Intercept)       GenotypeB73 GenotypeCFD02-003 GenotypeCFD02-006 
##         219.42642         -34.07804         -18.47939         -27.41759 
## GenotypeCFD02-010 GenotypeCFD02-024 
##         -11.58803         -24.49133
#With R's default treatment contrasts, the intercept is the adjusted mean of the
#reference genotype (the first factor level)...extract this from the fixed effects
overall.mean     <- fixef(model3)["(Intercept)"]

#The remaining fixed effects are the genotype effects, expressed as deviations from
#that reference (note the reference genotype itself therefore has no entry here,
#and so will not appear in the adjusted means below)
genotype.effects <- fixef(model3)[-1] 

#The adjusted means...rounded to two decimal places
adjusted.means   <- round(overall.mean + genotype.effects,2)

#Take a look at this vector
head(adjusted.means)
##       GenotypeB73 GenotypeCFD02-003 GenotypeCFD02-006 GenotypeCFD02-010 
##            185.35            200.95            192.01            207.84 
## GenotypeCFD02-024 GenotypeCFD02-027 
##            194.94            207.84
#The names look dirty...let's remove the "Genotype" string from them
#This will help downstream steps 
#Ignore the code here unless of interest...skip ahead to the cleaned means below
for (i in 1:length(names(adjusted.means))){
  names(adjusted.means)[i] <- strsplit(names(adjusted.means), split = "Genotype")[[i]][2]
}
adjusted.means <- data.frame(adjusted.means)
colnames(adjusted.means) <- c("Adjusted.Means")

#Cleaned Means!
head(adjusted.means)
##           Adjusted.Means
## B73               185.35
## CFD02-003         200.95
## CFD02-006         192.01
## CFD02-010         207.84
## CFD02-024         194.94
## CFD02-027         207.84
#------------
#Relate genotype records with phenotype records
#------------

#Read DH genotypes (rows are markers) and transpose this (columns are markers)
dh.genotypes <- t(read.table("CFD.txt", header = T, sep = "\t", 
                        check.names = FALSE, row.names = 1))

#Read Parental genotypes and transpose
parental.genotypes <- t(read.table("Parental.txt", header = T, sep = "\t", 
                        check.names = FALSE, row.names = 1))

#Combine both tables
all.genotypes <- rbind(dh.genotypes, parental.genotypes)
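#Defensive check (mine, not in the original): the rbind above assumes both files
#report the markers in the same order, so make sure the column names line up
stopifnot(identical(colnames(dh.genotypes), colnames(parental.genotypes)))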

#Get family info for each genotype...one record per genotype
#Recall our phenotype file has this in the first two columns
family.info <- data.frame(family = phenotypes$Population, genotype = phenotypes$Genotype)
family.info <- family.info[!duplicated(family.info),]
rownames(family.info) <- family.info$genotype
family.info <- data.frame(family.info)
head(family.info)
##           family  genotype
## CFD10-089  CFD10 CFD10-089
## CFD05-026  CFD05 CFD05-026
## CFD11-337  CFD11 CFD11-337
## CFD06-347  CFD06 CFD06-347
## CFD02-302  CFD02 CFD02-302
## CFD07-076  CFD07 CFD07-076
#Number of individuals phenotyped
nrow(family.info)
## [1] 949
#Number of genotypic records
nrow(all.genotypes)
## [1] 1028
#So it looks like not all phenotyped individuals are genotyped and vice versa.
#This is where things get interesting and all our rownames efforts are going to pay off...

#First let's recode the marker data to a 0, 1, 2 scale
#The coding in the paper is by the allele of the central line F353 (common to all
#families), so a score of 0 means homozygous for the allele that is NOT the F353
#allele...We can do this coding with synbreed
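#To make the coding concrete, here is a tiny sketch (mine, with made-up calls) of what
#the recoding does for one marker. Since heterozygotes were already set to NA, only
#dosages 0 and 2 of the F353 allele can occur
example.calls <- c("AA", "BB", "AA", NA) #hypothetical genotype calls at one marker
f353.call     <- "AA"                    #hypothetical call for the central line
ifelse(example.calls == f353.call, 2, 0) #-> 2 0 2 NA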

library(synbreed)
## Warning: package 'synbreed' was built under R version 3.2.3
#make an object relating all 3 kinds of records
gp.object <- create.gpData(pheno = adjusted.means, 
                           geno = all.genotypes,
                           family = family.info)

#The nice thing about this object is it tells us the record status of each individual
head(gp.object$covar)
##          id phenotyped genotyped family genotype
## 1      A287      FALSE     FALSE   <NA>     A287
## 2       B73       TRUE      TRUE    B73      B73
## 3 CFD01-003      FALSE      TRUE   <NA>     <NA>
## 4 CFD01-006      FALSE      TRUE   <NA>     <NA>
## 5 CFD01-008      FALSE      TRUE   <NA>     <NA>
## 6 CFD01-009      FALSE      TRUE   <NA>     <NA>
gp.object$covar$genotype <- NULL #we don't need that last column...ignore
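#Quick cross-tabulation (mine): how many individuals have phenotype records, genotype
#records, or both
table(phenotyped = gp.object$covar$phenotyped, genotyped = gp.object$covar$genotyped)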

#Code marker data, filter for minor allele frequency and missing data. All parameters
#are as in the paper except imputation...the paper imputes using family and mapping
#info...I got lazy about getting the mapping info so I am imputing randomly. I added
#family info to the gpData object, but we would need to subset the data to individuals
#with family information if we were to do family-based imputation. If you do this,
#please share it with me so I can update...

#Since we are coding based on F353 alleles and doing random imputation...we can't
#have missingness for F353, so remove markers that are missing in F353
gp.object.copy <- gp.object
gp.object.copy$geno <- gp.object$geno[,!is.na(gp.object$geno["F353",])]

#Code marker matrix...label.heter is NULL because we can't have heterozygous calls
genotypes.filtered.f353 <- codeGeno(gp.object.copy,
                               impute = TRUE, 
                               impute.type = "random",
                               label.heter = NULL,
                               maf = 0.01,
                               nmiss = 0.1,
                               reference.allele = as.vector(gp.object.copy$geno["F353",]))
## 
##      Summary of imputation 
##     total number of missing values                : 357289 
##     number of random imputations                  : 357289
#We can see how many markers got filtered out for being crappy
c(BeforeFiltering=ncol(all.genotypes),
  AfterFiltering=ncol(genotypes.filtered.f353$geno))
## BeforeFiltering  AfterFiltering 
##           56110           42900
#Additionally, since we coded by dosage of the central parent, we can check that the
#coding worked by looking at the frequencies in the overall set: we expect the
#frequency of the homozygous central-parent state (i.e. 2) to be much higher than the
#alternative, since the central line is common to all DHs
summary(genotypes.filtered.f353)
## object of class 'gpData' 
## covar 
##   No. of individuals 1119 
##           phenotyped 947 
##            genotyped 1028 
## pheno 
##   No. of traits:        1 
## 
##  Adjusted.Means 
##  Min.   :120.7  
##  1st Qu.:179.9  
##  Median :188.4  
##  Mean   :187.5  
##  3rd Qu.:196.1  
##  Max.   :218.3  
## 
## geno 
##   No. of markers 42900 
##   genotypes 0 2 
##   frequencies 0.172 0.828 
##   NA's 0.000 %
## map 
##   No. of mapped markers  
##   No. of chromosomes     0 
## 
##   markers per chromosome 
##  NULL
## 
## pedigree 
## NULL
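#Quick cross-check (mine): the frequency of the homozygous-F353 state can also be
#pulled straight from the coded matrix; it should match the frequency of "2" in the
#summary above
mean(genotypes.filtered.f353$geno == 2)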
#Again it is apparent that not all genotyped individuals are phenotyped and vice
#versa. synbreed has a convenient function that returns the records that have both,
#as a data frame
all.data <- gpData2data.frame(genotypes.filtered.f353)

#Look at the data
all.data[1:6,1:6]
##          ID Adjusted.Means abph1.15 abph1.22 ae1.4 ae1.8
## 1       B73         185.35        0        2     2     0
## 2 CFD02-003         200.95        2        2     2     2
## 3 CFD02-006         192.01        2        2     2     0
## 4 CFD02-010         207.84        2        2     2     0
## 5 CFD02-024         194.94        2        2     2     0
## 6 CFD02-027         207.84        0        2     2     2
#----
# G-matrices
#----

#Let's define a function to make a G-matrix from a marker matrix using the textbook formula
make.G.matrix <- function(M){
  
  #Every column is a marker...summing a column gives the count of the coded (reference) allele
  #Each row is an individual, so every marker has 2*nrow(M) allele copies in total
  #Dividing the column sums by the total number of allele copies gives allele frequencies

  #Vector of reference allele frequencies
  P <- colSums(M) / (2*nrow(M))
  
  #Center marker matrix
  for(i in 1:nrow(M)){
  
  M[i,] <- M[i,] - 2*P
  
  }
  
  Z <- M
  
  #scaling factor for the G-matrix; the trailing *2 is because DH/inbred lines
  #(0/2 coding, no heterozygotes) have twice the per-locus variance of an outbred
  #Hardy-Weinberg population
  K <- 2 * sum(P * (1 - P)) * 2 #(DH)

  #G-matrix
  G.matrix <- (Z %*% t(Z) / K)
  
  return(G.matrix)
}
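#For reference (mine, not in the paper): the row-by-row centering loop above can be
#vectorized. This sketch should give the same G-matrix for a 0/2-coded matrix M
make.G.matrix.vectorized <- function(M){
  P <- colSums(M) / (2*nrow(M))   #reference allele frequencies
  Z <- sweep(M, 2, 2*P)           #subtract 2*P from every column
  K <- 2 * sum(P * (1 - P)) * 2   #same DH scaling as above
  tcrossprod(Z) / K               #Z %*% t(Z), computed a bit faster
}
#e.g. all.equal(make.G.matrix(M), make.G.matrix.vectorized(M)) should be TRUE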
#----
# G-matrix based relationships (ALL DATA...this isn't in the paper)
#----
 
#Get marker matrix...minor allele coded...from previous steps
M <- genotypes.filtered.f353$geno

G.matrix <- make.G.matrix(M)

# This matrix now has relationships between all individuals. Every bi-parental family
# is represented by over 50 DHs, so if we plot it out it's hard to see their
# relationship with the inbred parents, which are represented by one genotype each.
# However, you can definitely make out the DH families. I will augment this G-matrix
# with replicated row entries for the parents so we can visualize this better.

#get names of parents
parents <- rownames(G.matrix)[grep("CFD", rownames(G.matrix), invert = TRUE)]

#replicate every parent 100 times
for(i in 1:length(parents)){
  G.matrix <- rbind(G.matrix,
                    G.matrix[rep(parents[i], 100), 1:ncol(G.matrix)])
}

#Plot fake/augmented G.Matrix
library(lattice) 
library(wesanderson)
levelplot(G.matrix[1:nrow(G.matrix),ncol(G.matrix):1],
          xlab = NULL, ylab = NULL, scales=list(draw=FALSE),
          col.regions = wes_palette("Zissou", 30, type = "continuous"), 
          main = "`Careful when reading` Relationship Matrix")

#The first third to the left is the DHs, followed by the parents. Can you tell which
#DH came from which parent? Can you tell which one F353 (the central line) is?

#---
#Understanding this matrix
#---

#The genotypes are coded 0, 1, 2 by dosage of the F353 allele (so homozygous for the
#F353 allele is 2). The variance terms in this matrix track homozygosity within an
#inbred or DH; the covariance terms track alleles shared between combinations
#thereof...So why can't we make out the central line if it shares something with all
#families? I think it is because the matrix is centered...by the P vector (see the
#function)...on the F353 allele, which also happens to be the major allele in the
#overall data. Try commenting out the centering for loop in the function and re-run
#to see what happens...make sure you "fix" the function back because we need it
#down-stream. The central line should then become clear, as in the link...
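#If you'd rather not edit make.G.matrix itself, here is a throwaway sketch (mine) of
#the uncentered version suggested above
make.similarity.uncentered <- function(M){
  P <- colSums(M) / (2*nrow(M))
  K <- 2 * sum(P * (1 - P)) * 2
  M %*% t(M) / K #same as make.G.matrix but without the centering step
}
#e.g. levelplot() the result of make.similarity.uncentered(genotypes.filtered.f353$geno)
#the same way as above; the central line should then stand out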

#----
#G-BLUP model
#----

#For the model, we are only interested in the DH families, so let's pull these out of the overall data
prediction.data <- all.data[grep("CFD", all.data$ID),]

#Number of DH lines
nrow(prediction.data)
## [1] 846
#Glance data
prediction.data[1:6,1:6]
##          ID Adjusted.Means abph1.15 abph1.22 ae1.4 ae1.8
## 2 CFD02-003         200.95        2        2     2     2
## 3 CFD02-006         192.01        2        2     2     0
## 4 CFD02-010         207.84        2        2     2     0
## 5 CFD02-024         194.94        2        2     2     0
## 6 CFD02-027         207.84        0        2     2     2
## 7 CFD02-036         195.82        2        2     2     2
#Let's re-create and attach the family information
family.names <- c()
for(i in 1:nrow(prediction.data)){
  
  f.name <- strsplit(as.character(prediction.data$ID[i]), split = "-")[[1]][1]
  family.names <- na.omit(c(family.names, f.name))
  
}

prediction.data <- data.frame(cbind(family=family.names, prediction.data))

#Glance data
prediction.data[1:6,1:6]
##   family        ID Adjusted.Means abph1.15 abph1.22 ae1.4
## 2  CFD02 CFD02-003         200.95        2        2     2
## 3  CFD02 CFD02-006         192.01        2        2     2
## 4  CFD02 CFD02-010         207.84        2        2     2
## 5  CFD02 CFD02-024         194.94        2        2     2
## 6  CFD02 CFD02-027         207.84        0        2     2
## 7  CFD02 CFD02-036         195.82        2        2     2
#----
# G-matrix (DHs only)
#----

#marker matrix
M <- as.matrix(prediction.data[, 4:ncol(prediction.data)])
rownames(M) <- prediction.data$ID

#DH G.Matrix
G.matrix <- make.G.matrix(M)

#Design Matrix for fixed effects
#This matrix assigns every genotype to a family
#Following is one cheap way to make this in R:
#model the phenotype regressed on family, without an intercept
model.design <- prediction.data$Adjusted.Means ~ 0 + prediction.data$family
m <- model.frame(model.design, prediction.data)
X <- model.matrix(model.design, m)
rownames(X) <- prediction.data$ID

#Glance design matrix
X[1:6,1:3]
##           prediction.data$familyCFD02 prediction.data$familyCFD03
## CFD02-003                           1                           0
## CFD02-006                           1                           0
## CFD02-010                           1                           0
## CFD02-024                           1                           0
## CFD02-027                           1                           0
## CFD02-036                           1                           0
##           prediction.data$familyCFD04
## CFD02-003                           0
## CFD02-006                           0
## CFD02-010                           0
## CFD02-024                           0
## CFD02-027                           0
## CFD02-036                           0
#What happened here? We set up a model formula whose design matrix is exactly this
#family-assignment matrix and then pulled that matrix out. Let's fix the column names

#Fix column names
for (i in 1:length(colnames(X))){
  colnames(X)[i] <- strsplit(colnames(X), split = "family")[[i]][2]
}

#Glance design matrix
X[1:2,1:3]
##           CFD02 CFD03 CFD04
## CFD02-003     1     0     0
## CFD02-006     1     0     0
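#For reference (mine, not in the paper): the same design matrix can be built more
#directly by handing the factor to model.matrix(), which also gives tidier column names
X.alt <- model.matrix(~ 0 + family, data = prediction.data)
colnames(X.alt) <- sub("^family", "", colnames(X.alt))
rownames(X.alt) <- prediction.data$ID
#all.equal(X, X.alt, check.attributes = FALSE) should be TRUE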
#Model elements
X <- X
G <- G.matrix
y    <- as.vector(prediction.data$Adjusted.Means)

c <- ncol(X) #This is how many families/parents we have
beta <- vector(length = c) 

n <- nrow(prediction.data) 
I    <- diag(n) 
Z    <- diag(n) #Design matrix for random effects

#Equation 1
#(Using the identity matrix for the residual covariance amounts to assuming the
#genetic and residual variances are equal; good enough for this demonstration)
V = Z %*% G %*% t(Z) + I  

#Equation 2
beta_hat = solve(t(X) %*% solve(V) %*% X) %*% t(X) %*% solve(V) %*% y

#Equation 3
u_hat = G %*% t(Z) %*% solve(V) %*% (y - (X %*% beta_hat))
true.bvs <- u_hat

# blup <- X
# 
# for(i in 1:ncol(blup)){
#   
#   blup[,i] <- blup[,i] * beta_hat[i]
#   
# }
# 
# blup <- rowSums(blup)
# 
# true.blup <- blup + u_hat

#What happened here?
#We used the standard BLUP machinery to estimate breeding values for all individuals
#using all phenotypic and genotypic records. These estimates will be our
#"gold-standard". Now, we will hide away parts of the phenotypic records as missing
#and predict breeding values for these missing ones. Then we will compare these back
#to our "gold-standard".
#---
# Cross-validation (k-fold)
#---

#Divide data into k sets, train on k-1 sets, predict kth set

k <- 3  #How many folds               
CVcycles <- 100 #How many cycles
est.bvs.overall <- c()  #empty object to store results

#----
# G-matrix for total set (same as before)
#----
M <- as.matrix(prediction.data[, 4:ncol(prediction.data)])
rownames(M) <- prediction.data$ID
G.matrix <- make.G.matrix(M)

for (CVcycle in 1:CVcycles) {
  
#Total set
total.set <- prediction.data
rownames(total.set) <- seq(1, nrow(total.set), 1)

#Set to sample from
sample.set <- rownames(total.set)                          
estimation.set.size <- round(nrow(prediction.data)/k,0)

#Sample row names (these will be rows we hide phenotypes for)
#i.e. estimation set
estimation.set.index <- sample(sample.set, 
                               size = estimation.set.size,
                               replace = FALSE)

#genotypes/lines that make the estimation set
estimation.set <- total.set[estimation.set.index,]$ID

#training set (make estimation set phenotypes "missing")
training.set <-  total.set
training.set$Adjusted.Means[as.numeric(estimation.set.index)] <- NA

#Loop bottleneck (taken out of loop)
# #----
# # G-matrix for total set (same as before)
# #----
# M <- as.matrix(training.set[, 4:ncol(training.set)])
# rownames(M) <- training.set$ID
# G.matrix <- make.G.matrix(M)

#Design matrix for fixed effects...same as before with some modifications
model.design <- training.set$ID ~ 0 + training.set$family
m <- model.frame(model.design, training.set)
X <- model.matrix(model.design, m)
rownames(X) <- training.set$ID

for (i in 1:length(colnames(X))){
  colnames(X)[i] <- strsplit(colnames(X), split = "family")[[i]][2]
}

#We don't know the fixed effects for the estimation set, so set them to 0 and remove
#rows corresponding to the estimation set from the design matrix
X[as.character(estimation.set), ] <- 0
X <- X[!rownames(X) %in% estimation.set,]

#Same as before with modifications as in X
Z    <- diag(nrow = nrow(training.set)) 
colnames(Z) <- rownames(Z)    <- rownames(G.matrix)
Z[as.character(estimation.set), ] <- 0
Z <- Z[!rownames(Z) %in% estimation.set,]

#Same as before with modifications as in X
I    <- diag(nrow = nrow(training.set)) 
colnames(I) <- rownames(I)    <- rownames(G.matrix)
I <- I[!rownames(I) %in% estimation.set,!colnames(I) %in% estimation.set]

#extract records with all pheno/geno information
training.set.sub <- training.set[!training.set$ID %in% estimation.set, ]

#Model Elements
#same as before with dimensional modifications to adjust for missing-ness
X <- X
G <- G.matrix
I    <- I
Z    <- Z
y    <- as.vector(training.set.sub$Adjusted.Means)
names(y) <- as.character(training.set.sub$ID)
c <- ncol(X) #This is how many families/parents we have
beta <- vector(length = c) 
n <- nrow(training.set.sub)

#Equation 1
V = Z %*% G %*% t(Z) + I  

#Equation 2
beta_hat = solve(t(X) %*% solve(V) %*% X) %*% t(X) %*% solve(V) %*% y

#Equation 3
u_hat = G %*% t(Z) %*% solve(V) %*% (y - (X %*% beta_hat))

est.bvs <- u_hat

#Combine estimated set with true set
est.bvs <- cbind(est.bvs, true.bvs)

#Extract only the ones that got estimated...drop training set
est.bvs <- est.bvs[estimation.set,]

est.bvs.overall <- rbind(est.bvs.overall, est.bvs)  

par(mfrow=c(1,1))
#plot(est.bvs.overall, xlab = "Estimated Breeding Value", ylab = "True Breeding Value")
}

#Plot results (un-comment the plot inside the loop above and re-run it for "magic")
plot(est.bvs.overall, xlab = "Estimated Breeding Value", ylab = "True Breeding Value")

#Accuracy
a <- lm(est.bvs.overall[,1] ~ est.bvs.overall[,2])
paste("accuracy was", round(summary(a)$adj.r.squared,2))
## [1] "accuracy was 0.78"
#---
# Cross-validation (family-fold)
#---

#Divide data by families, train on all families except 1, predict the 1

families <- levels(prediction.data$family) #family names
CVcycles <- 100 #How many cycles
est.bvs.overall <- c()  #empty object to store results

#----
# G-matrix for total set (same as before)
#----
M <- as.matrix(prediction.data[, 4:ncol(prediction.data)])
rownames(M) <- prediction.data$ID
G.matrix <- make.G.matrix(M)

for (CVcycle in 1:CVcycles) {
  
#Total set
total.set <- prediction.data
rownames(total.set) <- seq(1, nrow(total.set), 1)

#Set to sample from
sample.set <- families

#Sample row names (these will be rows we hide phenotypes for)
#i.e. estimation set
estimation.set.index <- sample(sample.set, 
                               size = 1,
                               replace = FALSE)

#genotypes/lines that make the estimation set
estimation.set <- subset(total.set$ID, total.set$family==estimation.set.index)

#training set (make estimation set phenotypes "missing")
training.set <-  total.set
for(i in 1:length(training.set$Adjusted.Means)){
  
  training.set$Adjusted.Means[i] <- 
  
  ifelse(training.set$family[i] == estimation.set.index, NA, 
         training.set$Adjusted.Means[i])
  
}

#Loop bottleneck (taken out of loop)
# #----
# # G-matrix for total set (same as before)
# #----
# M <- as.matrix(training.set[, 4:ncol(training.set)])
# rownames(M) <- training.set$ID
# G.matrix <- make.G.matrix(M)

#Design matrix for fixed effects...same as before with some modifications
model.design <- training.set$ID ~ 0 + training.set$family
m <- model.frame(model.design, training.set)
X <- model.matrix(model.design, m)
rownames(X) <- training.set$ID

for (i in 1:length(colnames(X))){
  colnames(X)[i] <- strsplit(colnames(X), split = "family")[[i]][2]
}

#We don't know the fixed effects for the estimation set, so set them to 0 and remove
#rows corresponding to the estimation set from the design matrix
X[as.character(estimation.set), ] <- 0
X <- X[!rownames(X) %in% estimation.set,]
X <- X[,!colnames(X) %in% estimation.set.index]

#Same as before with modifications as in X
Z    <- diag(nrow = nrow(training.set)) 
colnames(Z) <- rownames(Z)    <- rownames(G.matrix)
Z[as.character(estimation.set), ] <- 0
Z <- Z[!rownames(Z) %in% estimation.set,]
Z <- Z[,!colnames(Z) %in% estimation.set.index]

#Same as before with modifications as in X
I    <- diag(nrow = nrow(training.set)) 
colnames(I) <- rownames(I)    <- rownames(G.matrix)
I <- I[!rownames(I) %in% estimation.set,!colnames(I) %in% estimation.set]

#extract records with all pheno/geno information
training.set.sub <- training.set[!training.set$ID %in% estimation.set, ]

#Model Elements
#same as before with dimensional modifications to adjust for missing-ness
X <- X
G <- G.matrix
I    <- I
Z    <- Z
y    <- as.vector(training.set.sub$Adjusted.Means)
names(y) <- as.character(training.set.sub$ID)
c <- ncol(X) #This is how many families/parents we have
beta <- vector(length = c) 
n <- nrow(training.set.sub)

#Equation 1
V = Z %*% G %*% t(Z) + I  

#Equation 2
beta_hat = solve(t(X) %*% solve(V) %*% X) %*% t(X) %*% solve(V) %*% y

#Equation 3
u_hat = G %*% t(Z) %*% solve(V) %*% (y - (X %*% beta_hat))

est.bvs <- u_hat

#Combine estimated set with true set
est.bvs <- cbind(est.bvs, true.bvs)

#Extract only the ones that got estimated...drop training set
est.bvs <- est.bvs[estimation.set,]

est.bvs.overall <- rbind(est.bvs.overall, est.bvs)  

par(mfrow=c(1,1))
#plot(est.bvs.overall, xlab = "Estimated Breeding Value", ylab = "True Breeding Value")
}

#Plot results (un-comment the plot inside the loop above and re-run it for "magic")
plot(est.bvs.overall, xlab = "Estimated Breeding Value", ylab = "True Breeding Value")

#Accuracy
a <- lm(est.bvs.overall[,1] ~ est.bvs.overall[,2])
paste("accuracy was", round(summary(a)$adj.r.squared,2))
## [1] "accuracy was 0.43"