---
title: 'ILSE: simulated examples'
author: "Wei Liu"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{ILSE: simulated examples}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```


# Load ILSE package

The package can be loaded with the command:
```{r}
library("ILSE")
```

# ILSE can handle (non)missing data with continuous variables
First, we generate a small simulated data.
```{r  eval = FALSE}
  set.seed(1)
  n <- 100
  p <- 6
  X <- MASS::mvrnorm(n, rep(0, p), cor.mat(p, rho=0.5))
  beta0 <- rep(c(1,-1), times=3)
  Y <- -2+ X %*% beta0 + rnorm(n, sd=1)
```

## A special case: without missing values

Then, we fit the linear regression model without missing values based on ILSE.
```{r eval = FALSE}
ilse1 <- ilse(Y~X)
print(ilse1)
```

We can also create a (data.frame) object as input for ILSE.

```{r eval = FALSE}
dat <- data.frame(Y=Y, X=X)
ilse1 <- ilse(Y~., data=dat)
print(ilse1)
Coef(ilse1) # access the coefficients
Fitted.values(ilse1)[1:5]
Residuals(ilse1)[1:5]

```

Check the significant variables by bootstratp.
```{r eval = FALSE}
s1 <- summary(ilse1)
s1
```

## Handle data with missing values
First, we randomly remove some entries in X.
```{r eval = FALSE}
mis_rate <- 0.3
set.seed(1)
na_id <- sample(1:(n*p), n*p*mis_rate)
Xmis <- X
Xmis[na_id] <- NA
ncomp <- sum(complete.cases(Xmis))
message("Number of complete cases is ", ncomp, '\n')
```

Second, we use *lm* to fit linear regression model based on complete cases, i.e., CC analysis. We can not detect any siginificant covariates.

```{r eval = FALSE}
lm1 <- lm(Y~Xmis)
s_cc <- summary.lm(lm1)
s_cc
```

Third, we use *ILSE* to fit the linear regression model based on all data.
We can fit a linear regression model without intercept by setting formula:
```{r eval = FALSE}
ilse2 <- ilse(Y~Xmis+0, data=NULL, verbose=T)
print(ilse2)
```

Then, we  fit a linear regression model with intercept by following command
```{r eval = FALSE}
ilse2 <- ilse(Y~Xmis, data=NULL, verbose=T)
print(ilse2)
```

Fourth, *Bootstrap* is applied to evaluate the standard error and p-values of each coefficients estimated by ILSE. We observe four significant coefficients. 
```{r eval = FALSE}
s2 <- summary(ilse2, Nbt=20)
s2
```


In *ILSE* package, we also provide Full Information Maximum Likelihood for Linear Regression *fimlreg*. We show how to use it to handle the above missing data.
```{r eval = FALSE}
fimllm <- fimlreg(Y~Xmis)
print(fimllm)

```

We also use *bootstrap* to evaluate the standard error and p-values of each coefficients estimated by ILSE. We observe only one significant coefficients. 
```{r eval = FALSE}
s_fiml <- summary(fimllm, Nbt=20)
s_fiml
```

## Visualization 
We visualize the p-vaules of each methods , where red line denotes 0.05 in y-axis and blue line 0.1 in y-axis.
```{r eval = FALSE}
pMat <- cbind(CC=s_cc$coefficients[,4], ILSE=s2[,4], FIML=s_fiml[,4])
library(ggplot2)
df1 <- data.frame(Pval= as.vector(pMat[-1,]),
                    Method =factor(rep(c('CC', "ILSE", "FIML"),each=p)),
                    covariate= factor(rep(paste0("X", 1:p), times=3)))
ggplot(data=df1, aes(x=covariate, y=Pval, fill=Method)) + geom_bar(position = "dodge", stat="identity",width = 0.5) + geom_hline(yintercept = 0.05, color='red') + geom_hline(yintercept = 0.1, color='blue')
```





# ILSE can handle missing data with continuos and categorical variables
Base on the above data, we add a new column, a categorical variable (Sex), into the data.frame.  This variable is not associated with the outcome variable.
```{r  eval = FALSE}
dat <- data.frame(Y=Y, X=Xmis)
dat$Sex <- factor(rep(c('male', 'female'), times=n/2))
dat$Sex[sample(1:n, n*mis_rate)] <- NA
ilse1 <- ilse(Y~., data=dat, verbose = T)
```

We can change the bootstrap times in calculate the standard errors, Z value and p-values of coefficients.

```{r  eval = FALSE}
s3 <- summary(ilse1, Nbt=40)
s3
```


# ILSE can correctly identify the important variables

## generate data
First, we generate data from a linear regression model with three inportant variables(1,3,5) and three unimportant variables(2,4,6). 
```{r  eval = FALSE}
set.seed(10)
n <- 100
p <- 6
X <- MASS::mvrnorm(n, rep(0, p), cor.mat(p, rho=0.5))
beta0 <- rep(c(1,0), times=3)
Y <- -2+ X %*% beta0 + rnorm(n, sd=1)
message("The true regression coefficients are: ", paste0(beta0, '  '))
```

We randomly assign missing values in the design matrix.
```{r  eval = FALSE}
mis_rate <- 0.3
set.seed(1)
na_id <- sample(1:(n*p), n*p*mis_rate)
Xmis <- X
Xmis[na_id] <- NA
```

Next, we use ILSE to fit model.
```{r  eval = FALSE}
dat <- data.frame(Y=Y, X=Xmis)
ilse1 <- ilse(Y~., data=dat, verbose = T)
s3 <- summary(ilse1)
s3
```

Fit model by using lm and FIML, finally compare ILSE with these two methods.
```{r  eval = FALSE}
lm1 <- lm(Y~Xmis)
s_cc <- summary.lm(lm1)
fimllm <- fimlreg(Y~Xmis)
s_fiml <- summary(fimllm)
```

## Visualization
We visualize the p-vaules of each methods , where red line denotes 0.05 in y-axis.  Under significance level 0.05, we found both ILSE and FIML can identify all important variables (X1, X3 and X5), while CC method only identified X1 and X5.
```{r eval = FALSE}
library(ggthemes)
pMat <- cbind(CC=s_cc$coefficients[,4], ILSE=s3[,4], FIML=s_fiml[,4])
df1 <- data.frame(Pval= as.vector(pMat[-1,]),
                    Method =factor(rep(c('CC', "ILSE", "FIML"),each=p)),
                    covariate= factor(rep(paste0("X", 1:p), times=3)))
ggplot(data=df1, aes(x=covariate, y=Pval, fill=Method)) + geom_bar(position = "dodge", stat="identity",width = 0.5) + geom_hline(yintercept = 0.05, color='red') + scale_fill_economist()
```




# ILSE can handle data with high missing rate
Here, we generate a data with 80% missing values, then use ILSE to fit model.
```{r  eval = FALSE}

# generate data from linear model
set.seed(10)
n <- 100
p <- 6
X <- MASS::mvrnorm(n, rep(0, p), cor.mat(p, rho=0.5))
beta0 <- rep(c(1,-1), times=3)
Y <- -2+ X %*% beta0 + rnorm(n, sd=1)

# generate missing values
mis_rate <- 0.8
set.seed(1)
na_id <- sample(1:(n*p), n*p*mis_rate)
Xmis <- X
Xmis[na_id] <- NA
# retain 4 complete cases.
Xmis[1:4,] <- X[1:4, ]
sum(complete.cases(Xmis))
```

CC method will failed.
```{r  eval = FALSE}
lm1 <- lm(Y~Xmis)
summary.lm(lm1)
```

However, ILSE can still work.

```{r  eval = FALSE}
ilse2 <- ilse(Y~Xmis, verbose = T)
s2 <- summary(ilse2)
s2
```


# ILSE can handle large-scale data
We generate a large-scale data with n=1000 and p = 50
```{r  eval = FALSE}
n <- 1000
p <- 50
X <- MASS::mvrnorm(n, rep(0, p), cor.mat(p, rho=0.5))
beta0 <- rep(c(1,-1), length=p)
Y <- -2+ X %*% beta0 + rnorm(n, sd=1)

mis_rate <- 0.3
set.seed(1)
na_id <- sample(1:(n*p), n*p*mis_rate)
Xmis <- X
Xmis[na_id] <- NA


Xmis[1:10,] <- X[1:10,]
lm1 <- lm(Y~Xmis)
lm1
system.time(ilse2 <- ilse(Y~Xmis, data=NULL, verbose=T))
```

# Session information
```{r}
sessionInfo()
```