## ----include=FALSE------------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup--------------------------------------------------------------------
# load the package
library("CNVreg")

## ----echo=FALSE, warning=FALSE, message=FALSE---------------------------------
# load packages required for Rmd
library("kableExtra")
library("tidyverse")
library("ggplot2")
library("patchwork")

## -----------------------------------------------------------------------------
# load the example dataset
data("CNVCOVY", package = "CNVreg")

## -----------------------------------------------------------------------------
# view the dataset
summary(CNV)

## -----------------------------------------------------------------------------
# view the dataset
summary(Cov)

## -----------------------------------------------------------------------------
# view the dataset
summary(Y_QT)

## -----------------------------------------------------------------------------
# view the dataset
summary(Y_BT)

## ----warning=FALSE------------------------------------------------------------
# data preprocessing for a quantitative(continuous) outcome Y_QT
frag_data_QT <- prep(CNV = CNV, Y = Y_QT, Z = Cov, rare.out = 0.05)

## -----------------------------------------------------------------------------
# Format of `prep()` funtion output
 str(frag_data_QT)

## ----warning=FALSE------------------------------------------------------------
# data preprocessing with a binary trait
frag_data_BT <- prep(CNV = CNV, Y = Y_BT, Z = Cov, rare.out = 0.05)

## ----eval = FALSE-------------------------------------------------------------
# ## copy frag_data_QT
# frag_data_BT <- frag_data_QT
# 
# ### replace Y with Y_BT in the correct format: ordered named vector
# ### order the sample in Y_BT as in frag_data_QT$Y
# rownames(Y_BT) <- Y_BT$ID
# 
# frag_data_BT$Y <- Y_BT[names(frag_data_QT$Y), "Y"] |> drop()
# names(frag_data_QT$Y) <- rownames(frag_data_QT$Y)

## -----------------------------------------------------------------------------
  QT_TUNE <- withr::with_seed(
    12345,
    cvfit_WTSMTH(data = frag_data_QT, 
                 lambda1 = seq(-8, -3, 1), 
                 lambda2 = seq(12, 25, 2), 
                 weight = "eql", 
                 family = "gaussian",
                 cv.control = list(n.fold = 5L, 
                                   n.core = 1L, 
                                   stratified = FALSE),
                 verbose = FALSE))

## ----echo=FALSE---------------------------------------------------------------
# loss matrix of candidate tuning parameters
 QT_TUNE$Loss %>% format( digits = 6) %>%
  mutate(
    across(2:ncol(QT_TUNE$Loss), 
               ~ cell_spec(.x, 
                           color = ifelse(.x > min(QT_TUNE$Loss[,2:ncol(QT_TUNE$Loss)])+0.0001, "black", "white"), 
                           background = ifelse(.x <= min(QT_TUNE$Loss[, 2:ncol(QT_TUNE$Loss)])+0.0001, "red", "white")
                           )
               )
)%>%
  kable(booktabs = FALSE, linesep = "",  align = "c", format = "html", escape = F,  caption = "Average loss for each pair of candidate tuning parameters") %>%add_header_above(c(" " = 1, "Lambda 1" = ncol(QT_TUNE$Loss)-1))

## -----------------------------------------------------------------------------
# selected optimal tuning parameters with minimum loss
 QT_TUNE$selected.lambda 

## -----------------------------------------------------------------------------
##coefficients of intercept and covariates 
QT_TUNE$coef[c(1, 21:22), c("Vnames", "coef") ] 

## -----------------------------------------------------------------------------
# estimated coefficents for CNV
QT_TUNE$coef[2:20, ]
# non-zero coefficients 
# QT_TUNE$coef[which(abs(QT_TUNE$coef$coef)>0), ] 

## ----warning=FALSE, echo=FALSE------------------------------------------------
# plot the coefficients 
# keep CNV fragments and exclude intercept and covariates(Age, Sex) in ploting, row(2:20)

CNVR <- frag_data_QT$CNVR.info %>% group_by(CNV.id, deldup)%>%
  summarise(CNV.start = min(CNV.start), 
            CNV.end = max(CNV.end), 
            nfrag = length(CNV.id), 
            .groups = 'drop')
CNVR_adj <- CNVR[CNVR$nfrag > 1, ]
CNV_coef <- QT_TUNE$coef[2:20,]


P <- ggplot(CNV_coef, aes(x= CNV_coef$CNV.start , y=CNV_coef$coef*1000))+theme_bw()+ 
  geom_point(aes(color = ifelse(abs(CNV_coef$coef*1000) > 1, "red", ifelse(abs(CNV_coef$coef*1000) < 10^(-5), "white", "black")))) + 
  scale_color_identity()+
  theme(axis.text.x = element_text(size = 10, angle = 25, vjust = 1, hjust = 1))+
  scale_x_continuous("Genomic position", labels = as.character(CNVR_adj$CNV.start), breaks = CNVR_adj$CNV.start)+
  ylab("CNV coefficients (×10^(-3))")+
  geom_segment(x=CNV_coef$CNV.start, xend=CNV_coef$CNV.end, y=CNV_coef$coef*1000, yend=CNV_coef$coef*1000)+
  geom_rect(aes(xmin=CNVR_adj$CNV.start[1]-1000000, xmax=CNVR_adj$CNV.end[1]+1000000, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02)+
  annotate("text", x = CNVR_adj$CNV.start[1], y = max(CNV_coef$coef*1000) +0.2, label = "A", color="red")+
  geom_rect(aes(xmin=CNVR_adj$CNV.start[2]-1000000, xmax=CNVR_adj$CNV.end[2]+1000000, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02)+
  geom_rect(aes(xmin=CNVR_adj$CNV.start[3]-1000000, xmax=CNVR_adj$CNV.end[3]+1000000, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02)+
  annotate("text", x = CNVR_adj$CNV.start[3], y = max(CNV_coef$coef*1000)+0.2, label = "B", color="red")+
  geom_rect(aes(xmin=CNVR_adj$CNV.start[4]-1000000, xmax=CNVR_adj$CNV.end[4]+1000000, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02)

    

PA <- ggplot(CNV_coef[2:5,], aes(x= 1/2*(CNV_coef$CNV.start[2:5]+CNV_coef$CNV.end[2:5]) , y=CNV_coef$coef[2:5]*1000))+theme_bw()+ 
  ylab("")+
  geom_point(aes(color = ifelse(abs(CNV_coef$coef[2:5]*1000) > 1, "red", ifelse(abs(CNV_coef$coef[2:5]) < 10^(-5), "white", "black")))) + 
  scale_color_identity()+
  theme(axis.text.x = element_text(size = 10, angle = 25, vjust = 1, hjust = 1))+
  scale_x_continuous("", labels = as.character(CNV_coef$CNV.start[2:5]), breaks = CNV_coef$CNV.start[2:5])+
  geom_segment(x=CNV_coef$CNV.start[2:5], xend=CNV_coef$CNV.end[2:5], y=CNV_coef$coef[2:5]*1000, yend=CNV_coef$coef[2:5]*1000)+
  geom_rect(aes(xmin=CNVR_adj$CNV.start[1]-10, xmax=CNVR_adj$CNV.end[1]+10, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02)+
  annotate("text", x =1/2*( CNVR_adj$CNV.start[1]+CNVR_adj$CNV.end[1]), y = max(CNV_coef$coef*1000) +0.0002, label = "Zoom in on region A", color="red")
  

PB <- ggplot(CNV_coef[11:14,], aes(x= 1/2*(CNV_coef$CNV.start[11:14]+CNV_coef$CNV.end[11:14]) , y=CNV_coef$coef[11:14]*1000))+theme_bw()+ 
  ylab("")+
  geom_point(aes(color = ifelse(abs(CNV_coef$coef[11:14]*1000) > 1, "red", ifelse(abs(CNV_coef$coef[11:14]*1000) < 10^(-5), "white", "black")))) + 
  scale_color_identity()+
  theme(axis.text.x = element_text(size = 10, angle = 25, vjust = 1, hjust = 1))+
  scale_x_continuous("Genomic position", labels = as.character(CNV_coef$CNV.start[11:14]), breaks = CNV_coef$CNV.start[11:14])+
  geom_segment(x=CNV_coef$CNV.start[11:14], xend=CNV_coef$CNV.end[11:14], y=CNV_coef$coef[11:14]*1000, yend=CNV_coef$coef[11:14]*1000)+
  geom_rect(aes(xmin=CNVR_adj$CNV.start[3]-10, xmax=CNVR_adj$CNV.end[3]+10, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02)+
  annotate("text", x =1/2*( CNVR_adj$CNV.start[3]+CNVR_adj$CNV.end[3]), y = max(CNV_coef$coef*1000) + 0.2, label = "Zoom in on region B", color="red")


P + (PA/PB) + 
  plot_annotation(title = "CNV coefficient estiamte across the genomic region - A fine-tuned model")+ 
  plot_layout(axes = "collect", widths = c(2,1)) + 
  plot_layout( guide = "collect") & theme(legend.position="Top",
                      legend.text = element_text(size=12), legend.title = element_text(size=12)) 

## -----------------------------------------------------------------------------
BT_TUNE <- withr::with_seed(
  12345,
  cvfit_WTSMTH(frag_data_BT, 
               lambda1 = seq(-5.25, -4.75, 0.25), 
               lambda2 = seq(2,  8, 2), 
               weight = "eql",
               family = "binomial", 
               cv.control = list(n.fold = 5L, 
                                 n.core = 1L, 
                                 stratified = FALSE),
               iter.control = list(max.iter = 8L, 
                                   tol.beta = 10^(-3), 
                                   tol.loss = 10^(-6)), 
               verbose = FALSE))


## ----echo=FALSE---------------------------------------------------------------
# loss matrix of candidate tuning parameters
 
BT_TUNE$Loss %>% format( digits = 6) %>%
  mutate(
    across(2:ncol(BT_TUNE$Loss), 
               ~ cell_spec(.x, 
                           color = ifelse(.x > min(BT_TUNE$Loss[,2:ncol(BT_TUNE$Loss)])+0.000001, "black", "white"), 
                           background = ifelse(.x <= min(BT_TUNE$Loss[, 2:ncol(BT_TUNE$Loss)])+0.000001, "red", "white")
                           )
               )
)%>%
  kable(booktabs = FALSE, linesep = "",  align = "c", format = "html", escape = F,  caption = "Average loss for each pair of candidate tuning parameters") %>%add_header_above(c(" " = 1, "Lambda 1" = ncol(BT_TUNE$Loss)-1))

## -----------------------------------------------------------------------------
# selected optimal tuning parameters with minimum loss
 BT_TUNE$selected.lambda 

## -----------------------------------------------------------------------------

BT_TUNE$coef[c(1, 21:22), c("Vnames", "coef") ]


## -----------------------------------------------------------------------------
BT_TUNE$coef[2:20, ]

## ----warning=FALSE, echo=FALSE------------------------------------------------
CNV_coef <- BT_TUNE$coef[2:20,]


P <- ggplot(CNV_coef, aes(x= CNV_coef$CNV.start , y=CNV_coef$coef*1000))+theme_bw()+ 
  geom_point(aes(color = ifelse(abs(CNV_coef$coef*1000) > 1, "red", ifelse(abs(CNV_coef$coef*1000) < 10^(-5), "white", "black")))) + 
  scale_color_identity()+
  theme(axis.text.x = element_text(size = 10, angle = 25, vjust = 1, hjust = 1))+
  scale_x_continuous("Genomic position", labels = as.character(CNVR_adj$CNV.start), breaks = CNVR_adj$CNV.start)+
  ylab("CNV coefficients (×10^(-3))")+
  geom_segment(x=CNV_coef$CNV.start, xend=CNV_coef$CNV.end, y=CNV_coef$coef*1000, yend=CNV_coef$coef*1000)+
  geom_rect(aes(xmin=CNVR_adj$CNV.start[1]-1000000, xmax=CNVR_adj$CNV.end[1]+1000000, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02)+
  annotate("text", x = CNVR_adj$CNV.start[1], y = max(CNV_coef$coef*1000)+0.2, label = "A", color="red")+
  geom_rect(aes(xmin=CNVR_adj$CNV.start[2]-1000000, xmax=CNVR_adj$CNV.end[2]+1000000, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02)+
  geom_rect(aes(xmin=CNVR_adj$CNV.start[3]-1000000, xmax=CNVR_adj$CNV.end[3]+1000000, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02)+
  annotate("text", x = CNVR_adj$CNV.start[3], y = max(CNV_coef$coef*1000)+0.2, label = "B", color="red")+
  geom_rect(aes(xmin=CNVR_adj$CNV.start[4]-1000000, xmax=CNVR_adj$CNV.end[4]+1000000, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02)

    

PA <- ggplot(CNV_coef[2:5,], aes(x= 1/2*(CNV_coef$CNV.start[2:5]+CNV_coef$CNV.end[2:5]) , y=CNV_coef$coef[2:5]*1000))+theme_bw()+ 
  ylab("")+
  geom_point(aes(color = ifelse(abs(CNV_coef$coef[2:5]*1000) > 0.001, "red", ifelse(abs(CNV_coef$coef[2:5]*1000) < 10^(-8), "white", "black")))) + 
  scale_color_identity()+
  theme(axis.text.x = element_text(size = 10, angle = 25, vjust = 1, hjust = 1))+
  scale_x_continuous("", labels = as.character(CNV_coef$CNV.start[2:5]), breaks = CNV_coef$CNV.start[2:5])+
  geom_segment(x=CNV_coef$CNV.start[2:5], xend=CNV_coef$CNV.end[2:5], y=CNV_coef$coef[2:5]*1000, yend=CNV_coef$coef[2:5]*1000)+
  geom_rect(aes(xmin=CNVR_adj$CNV.start[1]-10, xmax=CNVR_adj$CNV.end[1]+10, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02)+
  annotate("text", x =1/2*( CNVR_adj$CNV.start[1]+CNVR_adj$CNV.end[1]), y = max(CNV_coef$coef*1000) + 0.4, label = "Zoom in on region A", color="red")
  

PB <- ggplot(CNV_coef[11:14,], aes(x= 1/2*(CNV_coef$CNV.start[11:14]+CNV_coef$CNV.end[11:14]) , y=CNV_coef$coef[11:14]*1000))+theme_bw()+ 
  ylab("")+
  geom_point(aes(color = ifelse(abs(CNV_coef$coef[11:14]*1000) > 1, "red", ifelse(abs(CNV_coef$coef[11:14]*1000) < 10^(-5), "white", "black")))) + 
  scale_color_identity()+
  theme(axis.text.x = element_text(size = 10, angle = 25, vjust = 1, hjust = 1))+
  scale_x_continuous("Genomic position", labels = as.character(CNV_coef$CNV.start[11:14]), breaks = CNV_coef$CNV.start[11:14])+
  geom_segment(x=CNV_coef$CNV.start[11:14], xend=CNV_coef$CNV.end[11:14], y=CNV_coef$coef[11:14]*1000, yend=CNV_coef$coef[11:14]*1000)+
  geom_rect(aes(xmin=CNVR_adj$CNV.start[3]-10, xmax=CNVR_adj$CNV.end[3]+10, ymin=min(CNV_coef$coef*1000), ymax = max(CNV_coef$coef*1000)), fill="yellow", alpha = 0.02)+
  annotate("text", x =1/2*( CNVR_adj$CNV.start[3]+CNVR_adj$CNV.end[3]), y = max(CNV_coef$coef*1000) +0.4, label = "Zoom in on region B", color="red")


P + (PA/PB) +plot_annotation(title = "CNV coefficient estiamte across the genomic region - A fine-tuned model")+
  plot_layout(axes = "collect", widths = c(2,1)) + plot_layout( guide = "collect") & theme(legend.position="Top",
                      legend.text = element_text(size=12), legend.title = element_text(size=12)) 

## -----------------------------------------------------------------------------
# we know the optimal tuning parameters and directly apply it to reproduce the analysis for a continuous outcome.
QT_fit <- fit_WTSMTH(frag_data_QT, 
                      lambda1 = -5, 
                      lambda2 = 20, 
                      weight="eql",
                      family="gaussian")

## -----------------------------------------------------------------------------

QT_fit[c(1, 21:22), c("Vnames", "coef") ]


## -----------------------------------------------------------------------------
QT_fit[2:20, ]