## ----setup, include=FALSE-----------------------------------------------------
library(evola)

## -----------------------------------------------------------------------------
set.seed(1)
# Data
Gems <- data.frame(
  Color = c("Red", "Blue", "Purple", "Orange",
            "Green", "Pink", "White", "Black", 
            "Yellow"),
  Weight = round(runif(9,0.5,5),2),
  Value = round(abs(rnorm(9,0,5))+0.5,2)
)
head(Gems)

## -----------------------------------------------------------------------------
# Task: Gem selection. 
# Aim: Get highest combined value.
# Restriction: Max weight of the gem combined = 10. 
res0<-evolafit(cbind(Weight,Value)~Color, dt= Gems,
               # constraints: if greater than this ignore
               constraintsUB = c(10,Inf), 
               # constraints: if smaller than this ignore
               constraintsLB= c(-Inf,-Inf), 
               # weight the traits for the selection
               traitWeight = c(0,1), 
               # population parameters
               nCrosses = 100, nProgeny = 20, recombGens = 1, 
               # coancestry parameters
               A=NULL, lambda=0, nQTLperInd = 1, 
               # selection parameters
               propSelBetween = .9, propSelWithin =0.9, 
               nGenerations = 15, verbose = FALSE
) 
pmonitor(res0)

## -----------------------------------------------------------------------------
# index for the best solution for trait Value
best=bestSol(res0)["pop","Value"]; best 
# actual solution
res0$M[best,] 
# value and weight for the selected solution 
xa = res0$M[best,] %*% as.matrix(Gems[,c("Weight","Value")]); xa

## ----fig.show='hold'----------------------------------------------------------
data(DT_cpdata)
DT <- DT_cpdata
head(DT)

## ----fig.show='hold'----------------------------------------------------------
# get best 20 individuals weighting variance by 0.5
res<-evolafit(cbind(Yield, occ)~id, dt= DT, 
              # constraints: if sum is greater than this ignore 
              constraintsUB = c(Inf,20), 
              # constraints: if sum is smaller than this ignore
              constraintsLB= c(-Inf,-Inf), 
              # weight the traits for the selection
              traitWeight = c(1,0), 
              # population parameters
              nCrosses = 100, nProgeny = 10, 
              # coancestry parameters
              A=A, lambda= (30*pi)/180 , nQTLperInd = 2, 
              # selection parameters
              propSelBetween = 0.5, propSelWithin =0.5, 
              nGenerations = 15, verbose=FALSE) 

## ----fig.show='hold'----------------------------------------------------------
best = bestSol(res)["pop","Yield"];
sum(res$M[best,]) # total # of inds selected

## ----fig.show='hold'----------------------------------------------------------
pmonitor(res)
plot(DT$Yield, col=as.factor(res$M[best,]), 
     pch=(res$M[best,]*19)+1)


## -----------------------------------------------------------------------------
data(DT_technow)
DT <- DT_technow
DT$occ <- 1; DT$occ[1]=0
M <- M_technow
A <- A.mat(M)
head(DT)

## -----------------------------------------------------------------------------
# silent for CRAN checks restriction on vignettes time
# run the genetic algorithm
#   res<-evolafit(formula = c(GY, occ)~hy, dt= DT, 
#                 # constraints: if sum is greater than this ignore
#                 constraintsUB = c(Inf,100), 
#                 # constraints: if sum is smaller than this ignore
#                 constraintsLB= c(-Inf,-Inf),
#                 # weight the traits for the selection
#                 traitWeight = c(1,0), 
#                 # population parameters
#                 nCrosses = 100, nProgeny = 10, 
#                 # coancestry parameters
#                 A=A, lambda= (20*pi)/180 , nQTLperInd = 100, 
#                 # selection parameters
#                 propSelBetween = 0.5, propSelWithin =0.5, 
#                 nGenerations = 15, verbose=FALSE) 
# best = bestSol(res)["pop","GY"]
# sum(res$M[best,]) # total # of inds selected

## -----------------------------------------------------------------------------
# pmonitor(res)
# plot(DT$GY, col=as.factor(res$M[best,]), 
#        pch=(res$M[best,]*19)+1)

## -----------------------------------------------------------------------------
data(DT_wheat)
DT <- as.data.frame(DT_wheat)
DT$id <- rownames(DT) # IDs
DT$occ <- 1; DT$occ[1]=0 # to track occurrences
DT$dummy <- 1; DT$dummy[1]=0 # dummy trait
# if genomic
# GT <- GT_wheat + 1; rownames(GT) <- rownames(DT)
# A <-  GT%*%t(GT)
# A <- A/mean(diag(A))
# if pedigree
A <- A_wheat

## -----------------------------------------------------------------------------
##Perform eigenvalue decomposition for clustering
##And select cluster 5 as target set to predict
pcNum=25
svdWheat <- svd(A, nu = pcNum, nv = pcNum)
PCWheat <- A %*% svdWheat$v
rownames(PCWheat) <- rownames(A)
DistWheat <- dist(PCWheat)
TreeWheat <- cutree(hclust(DistWheat), k = 5 )
plot(PCWheat[,1], PCWheat[,2], col = TreeWheat, 
     pch = as.character(TreeWheat), xlab = "pc1", ylab = "pc2")
vp <- rownames(PCWheat)[TreeWheat == 3]; length(vp)
tp <- setdiff(rownames(PCWheat),vp)

## -----------------------------------------------------------------------------
As <- A[tp,tp]
DT2 <- DT[rownames(As),]

## -----------------------------------------------------------------------------
res<-evolafit(cbind(dummy, occ)~id, dt= DT2, 
                # constraints: if sum is greater than this ignore 
                constraintsUB = c(Inf, 100), 
                # constraints: if sum is smaller than this ignore
                constraintsLB= c(-Inf, -Inf), 
                # weight the traits for the selection
                traitWeight = c(1,0), 
                # population parameters
                nCrosses = 100, nProgeny = 10, 
                # coancestry parameters
                A=As,
                lambda=(60*pi)/180, nQTLperInd = 80, 
                # selection parameters
                propSelBetween = 0.5, propSelWithin =0.5, 
                nGenerations = 15, verbose = FALSE)

best = bestSol(res)["pop","dummy"]
sum(res$M[best,]) # total # of inds selected

## -----------------------------------------------------------------------------
cex <- rep(0.5,nrow(PCWheat))
names(cex) <- rownames(PCWheat)
cex[names(which(res$M[best,]==1))]=2
plot(PCWheat[,1], PCWheat[,2], col = TreeWheat, cex=cex,
     pch = TreeWheat, xlab = "pc1", ylab = "pc2")

## -----------------------------------------------------------------------------
DT2$cov <- apply(A[tp,vp],1,mean)

## -----------------------------------------------------------------------------
res<-evolafit(cbind(cov, occ)~id, dt= DT2, 
                # constraints: if sum is greater than this ignore 
                constraintsUB = c(Inf, 100), 
                # constraints: if sum is smaller than this ignore
                constraintsLB= c(-Inf, -Inf), 
                # weight the traits for the selection
                traitWeight = c(1,0), 
                # population parameters
                nCrosses = 100, nProgeny = 10, 
                # coancestry parameters
                A=As,
                lambda=(60*pi)/180, nQTLperInd = 80, 
                # selection parameters
                propSelBetween = 0.5, propSelWithin =0.5, 
                nGenerations = 15, verbose = FALSE)
best = bestSol(res)["pop","cov"]
sum(res$M[best,]) # total # of inds selected

## -----------------------------------------------------------------------------
cex <- rep(0.5,nrow(PCWheat))
names(cex) <- rownames(PCWheat)
cex[names(which(res$M[best,]==1))]=2
plot(PCWheat[,1], PCWheat[,2], col = TreeWheat, cex=cex,
     pch = TreeWheat, xlab = "pc1", ylab = "pc2")

## -----------------------------------------------------------------------------
data(DT_technow)
DT <- DT_technow
DT$occ <- 1; DT$occ[1]=0
M <- M_technow
A <- A.mat(M)

Z=with(DT,overlay(dent,flint) )#  Matrix::sparse.model.matrix(~dent-1, data=DT)
rownames(Z) <- DT$hy # needed to link to the QTL matrix

## -----------------------------------------------------------------------------
# regular fitness function
fitnessf <-function (Y, b, d, Q, Z) {
  fit <- Y %*% b - d
  return(fit)
}
# new fitness function with constraint
fitnessf <-function (Y, b, d, Q, Z) {
  X=Q%*%Z[colnames(Q),]
  bad <- as.vector( apply(X,1, function(x){length(which(x > 5))}) ) 
  bad <- which(bad > 0)
  fit <- Y %*% b - d
  if(length(bad) > 0){fit[bad,1]=min(fit[,1])}
  return(fit)
}

## -----------------------------------------------------------------------------
# silent for CRAN checks restriction on time
# res<-evolafit(formula = c(GY, occ)~hy,
#               dt= DT, 
#               # constraints: if sum is greater than this ignore
#               constraintsUB = c(Inf,50), 
#               # constraints: if sum is smaller than this ignore
#               constraintsLB= c(-Inf,-Inf),
#               # weight the traits for the selection
#               traitWeight = c(1,0), 
#               # population parameters
#               nCrosses = 100, nProgeny = 10, 
#               # coancestry parameters
#               A=A, lambda= (10*pi)/180 , nQTLperInd = 40, 
#               # new fitness function and additional args
#               fitnessf = fitnessf, Z=Z,
#               # selection parameters
#               propSelBetween = 0.5, propSelWithin =0.5, 
#               nGenerations = 15, verbose=FALSE) 
# 
# best = bestSol(res)["pop","GY"]
# xa = (res$M %*% DT$GY)[best,]; xa 
# xAx = res$M[best,] %*% A %*% res$M[best,]; xAx 
# sum(res$M[best,]) # total # of inds selected

## -----------------------------------------------------------------------------
# # check how many times an individual was used in the final crosses
# crosses <- data.frame(cross=names(which( res$M[best,] == 1)))
# table(unlist(strsplit(crosses$cross,":")))
# # check performance of crosses selected
# plot(DT$GY, col=as.factor(res$M[best,]), 
#        pch=(res$M[best,]*19)+1)

## -----------------------------------------------------------------------------
data("mtcars")
mtcars <- as.data.frame(apply(mtcars,2,scale))
mtcars$inter <- 1
# head(mtcars)
# relationship between the 2 variables
# plot(mpg~hp, data=mtcars)
mod <- lm(mpg~hp, data=mtcars);mod

# create initial QTL effects
a <- seq(-1,1,.1);a
dt <- as.data.frame(expand.grid(a,a))
colnames(dt) <- paste0("alpha",1:ncol(dt))
dt$qtl=paste0("Q",1:nrow(dt))
dt$inter <- rnorm(nrow(dt))
head(dt)

# create n samples equivalent to the number of progeny
# you are planning to simulate (e.g., 1000)
sam <- sample(1:nrow(mtcars),500,replace = TRUE)
y <- mtcars$mpg[sam]
one <- rep(1,length(y))
x <- mtcars$hp[sam]
x2 <- mtcars$hp[sam]^2
X <- cbind(one,x)
plot(x,y)
# Task: linear regression
res0<-evolafit(formula=cbind(inter,alpha1)~qtl, dt= dt,
               # constraints: if greater than this ignore
               constraintsUB = c(Inf,Inf), 
               # constraints: if smaller than this ignore
               constraintsLB= c(-Inf,-Inf), 
               # weight the traits for the selection
               traitWeight = c(1,1), 
               # population parameters
               nCrosses = 50, nProgeny = 10, recombGens = 1, 
               # coancestry parameters
               A=NULL, lambda=0, nQTLperInd = 1, 
               # least MSE function (y - Xb)^2
               fitnessf=function(Y,b,d,Q,x,y){ apply(( (y%*%Jc(500)) - ( X%*%t(Y)) )^2,2,sum) },
               # selection parameters
               propSelBetween = 0.5, propSelWithin =0.5, selectTop=FALSE,
               nGenerations = 15, y=y, x=x, verbose = FALSE
) 

# develop a joint fitness function that uses all traits

pmonitor(res0)
# this time the best solution is the one that minimizes the error
error = ( stan(y) - apply( X*res0$pheno,1,sum ) )^2
best=which(error == min(error))[1]
xa=res0$M[best,] %*% as.matrix(dt[,c("inter","alpha1")]); xa

plot( as.matrix(mtcars[,c("inter","hp")]) %*% t(xa)  , mtcars$mpg,
      main="Correlation between GA-prediction and observed") # GA
plot( (mtcars$hp * mod$coefficients[2] ) + mod$coefficients[1] , mtcars$mpg,
      main="Correlation between lm-prediction and observed") # LM
# Correlation between GA-prediction and observed 
cor( as.matrix(mtcars[,c("inter","hp")]) %*% t(xa)  , mtcars$mpg) 
# Correlation between lm-prediction and observed
cor( (mtcars$hp * mod$coefficients[2] ) + mod$coefficients[1] , mtcars$mpg) # LM