Cheminformatics/Other/14 Sept Test and Prep.Rmd

---
title: "Test preparation 14 Sept"
author: "Jonathan Herrewijnen"
date: "September 14, 2018"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

## R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.

When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

```{r cars}
summary(cars)
```

## Including Plots

You can also embed plots, for example:

```{r pressure, echo=FALSE}
plot(pressure)
```

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.

```{r}
#Library etc
require(reshape2)

iris$id <- rownames(iris)
iris_long <- melt(iris)

## Using Species, id as id variables
iris_wide <- dcast(formula = Species + id ~ variable, data = iris_long )
head(iris_wide)

#Calculate the average of each measurement for long/wide
head(iris_wide)
aggregate(formula = .~ Species, data = iris_wide[, -2], FUN = mean)

#Offff
aggregate(cbind(Sepal.Length, Sepal.Width) ~Species, data=iris_wide, FUN =mean)

aggregate(formula = value ~ Species + variable, data = iris_long, FUN = mean)

#Create list from data
iris_list <- split(iris, iris$Species)
head(iris_list)

summary_list = alist()
for( i in seq_along(iris_list)){
 summary_list[[i]] <- summary(iris_list[[i]])
}
summary_list

#Create list using lapply()
lapply(iris_list, summary)

#Cars library
head(mtcars)

wt_hp_perCyl <- aggregate(cbind(wt, hp) ~ cyl, data = mtcars, FUN = mean)
wt_hp_perCyl

#merging
merged_result <- merge(mtcars, wt_hp_perCyl, by = "cyl", suffixes = c("", "_mean"))
head(merged_result)

#Selecting
sel_mtcars <- mtcars[ (mtcars$cyl == 6 | mtcars$cyl == 8) & mtcars$gear <= 3 & mtcars$mpg > 19 ,]
sel_mtcars
```

//PLOTS ETC
```{r}
plot(x = iris$Sepal.Length, y = iris$Sepal.Width)
abline(a = 3.5, b = -0.1, col = 'red', lwd = 3)

myFun <- function(input_vector, a ,b) {
y <- a + b*input_vector
return(y)
}

lines(iris$Sepal.Length,myFun(input_vector = iris$Sepal.Length, a = 4, b = -0.15), col = 'blue', cex = 3 )

par(mfrow = c(2, 2)) # for plotting 4 graphs in a single figure.
with(iris, hist(Sepal.Length))
with(iris, hist(Sepal.Width))
with(iris, hist(Petal.Length))
with(iris, plot(iris$Sepal.Length, Petal.Width))

require(ggplot2)
summary(midwest)

#SELEcTING SOMETHING FROM A GRAPH
diamonds[ diamonds$y == 58.9, ]

```

//ACTUAL TEST HERE
```{r}
a <- 7

myFun <- function(x) {
a <- 2
y <- x + a
return(y)
a
}

summary(a)

plot(Sepal.Length~Sepal.Width,data = iris)
plot(Sepal.Length~Species,data = iris)

summary(iris)

my_data <- iris_wide
iris_wide
my_data[[3]]$Variable[8]
iris$Species[51]

arg1 <- 1
arg2 <-2
input <- 3

myFun <- function(arg1, arg2, x) {

  if( is.numeric(c(arg1,arg2))){

  result <- cor(arg1, arg2)

 } else{

 print("Input should be numeric") }

}

y <- myFun(1, 2, x)
y
```

```{r}
myFun <- function(arg1, arg2, input) {

  if( is.numeric(c(arg1,arg2))){

  result <- cor(arg1, arg2)

 } else{

 print("Input should be numeric") }

}

y <- myFun(1, 2, 5)
y+1
cor(1, 2)

a<-1
b<-2

myFun <- function(a, b) {
  result1 <- a * b
  result2 <- a^b
  result3 <- seq(from = a, to = b, length.out = 10)
  my_list <- list()
  my_list[[1]] <- result1
  my_list[[2]] <- result2
  my_list[[3]] <- result3
  return(my_list)
}

myFun(10, 20)

summary(iris)
iris_slct <- iris[(iris$Species == "setosa"  & iris$Sepal.Length<4.5) | (iris$Species =="versicolor" & iris$Sepal.Width>3.5), ]
iris_slct2 <- iris[(iris$Species =="virginica" & iris$Sepal.Width>3.5), ]
iris_slct
iris_slct2

library(reshape2)
require(reshape2)

?sapply
?ggplot
?aes

?aggregate
?by
?split
?reshape2
?cars

split(iris)
```

//THE EXERCISE

dat is the input dataframe
cat_colnames is the column name to summarize by.
sel_colname is the column what will be filtered/ selected over
sel_value is the value we filter/select over
The function should select all rows smaller than the sel_value within the sel_colname column.
The function should calculate the mean of all columns of your selected data per level of your cat_colname column.
The result of the function must be a list containing the result of the filtering and the result of the summarization.
In your R- script also run the function you created, you can use the iris dataset to test.
Save the result of your function to disk using the save() function
Write the function in a R script. Save the file with the same name as the function and with the .R extension.
Upload the .R file and the upload the result of your function to BB.

```{r}
dat <-iris

#To get you started here are some hints:
my_fun <- function(dat, cat_colname, sel_colname, sel_value) {

#to filter/ select using the variable argument use:
dat[dat[ ,sel_colname]<sel_value, ]

#calculate mean in a for loop
  for(i in seq_along(cat_colname)){
    dat2[[i]]<-mean(dat[[i]])
  }

  #to summarize use in the aggregate function:
  dat3 <- as.formula(paste0(". ~", cat_colname))

  my_list2 <-list()
  my_list2[[1]] <-dat2 #This is the mean
  my_list2[[2]] <-dat  #This is the selection
  my_list2[[3]] <-dat3 #This is the summary
}

my_fun(1, 2, 3, 4)
```

//QUESTION 12

Use ggplot to plot the esoph dataset.

In 1 figure create a separate bar plot for each alcohol and age group combination. There are 6 age groups and 4 alcohol groups so there should be 24 bar plots.

Map the ncases and ncontrols to the x-axis (so there are 2 bars per plot). Map the corresponding values to the y-axis. Map the tobacco group to the fill aesthetic.

Save the figure, export as pdf using RStudio. With the export / save as pdf and preview you can easily adjust the size (use larger numbers to fit in more graphs)

If the Rstudio window for plotting is not working you can use pdf(file = , height = 12.., width = 12) {plot here} dev.off()

Don't forget to upload the plot!

Hint: first melt the numeric data to 1 column
```{r}
require(ggplot2)
require(reshape2)

head(midwest)

my_alcohol_plot <- esoph
my_alcohol_plot <- melt(esoph)

head(my_alcohol_plot)

par(mfrow=c(2, 12))

ggplot(data=my_alcohol_plot, aes(x=agegp, y=alcgp), hist)

sapply(esoph[, 1:4], hist)

1+1

require(stats)
require(graphics) # for mosaicplot
summary(esoph)

## effects of alcohol, tobacco and interaction, age-adjusted
model1 <- glm(cbind(ncases, ncontrols) ~ agegp + tobgp * alcgp,
              data = esoph, family = binomial())
anova(model1)
## Try a linear effect of alcohol and tobacco
model2 <- glm(cbind(ncases, ncontrols) ~ agegp + unclass(tobgp)
                                         + unclass(alcgp),
              data = esoph, family = binomial())
summary(model2)
## Re-arrange data for a mosaic plot
ttt <- table(esoph$agegp, esoph$alcgp, esoph$tobgp)
o <- with(esoph, order(tobgp, alcgp, agegp))
ttt[ttt == 1] <- esoph$ncases[o]
tt1 <- table(esoph$agegp, esoph$alcgp, esoph$tobgp)
tt1[tt1 == 1] <- esoph$ncontrols[o]
tt <- array(c(ttt, tt1), c(dim(ttt),2),
            c(dimnames(ttt), list(c("Cancer", "control"))))
mosaicplot(tt, main = "esoph data set", color = TRUE)

str(esoph)
plot.new()
boxplot(x=esoph$agegp, y=esoph$alcgp)
1+1
```