312 lines
7.5 KiB
Plaintext
312 lines
7.5 KiB
Plaintext
|
---
|
||
|
title: "Test preparation 14 Sept"
|
||
|
author: "Jonathan Herrewijnen"
|
||
|
date: "September 14, 2018"
|
||
|
output: html_document
|
||
|
---
|
||
|
|
||
|
```{r setup, include=FALSE}
|
||
|
knitr::opts_chunk$set(echo = TRUE)
|
||
|
```
|
||
|
|
||
|
## R Markdown
|
||
|
|
||
|
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.
|
||
|
|
||
|
When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
|
||
|
|
||
|
```{r cars}
|
||
|
summary(cars)
|
||
|
```
|
||
|
|
||
|
## Including Plots
|
||
|
|
||
|
You can also embed plots, for example:
|
||
|
|
||
|
```{r pressure, echo=FALSE}
|
||
|
plot(pressure)
|
||
|
```
|
||
|
|
||
|
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.
|
||
|
|
||
|
```{r}
|
||
|
#Library etc
|
||
|
require(reshape2)
|
||
|
|
||
|
iris$id <- rownames(iris)
|
||
|
iris_long <- melt(iris)
|
||
|
|
||
|
## Using Species, id as id variables
|
||
|
iris_wide <- dcast(formula = Species + id ~ variable, data = iris_long )
|
||
|
head(iris_wide)
|
||
|
|
||
|
#Calculate the average of each measurement for long/wide
|
||
|
head(iris_wide)
|
||
|
aggregate(formula = .~ Species, data = iris_wide[, -2], FUN = mean)
|
||
|
|
||
|
#Offff
|
||
|
aggregate(cbind(Sepal.Length, Sepal.Width) ~Species, data=iris_wide, FUN =mean)
|
||
|
|
||
|
aggregate(formula = value ~ Species + variable, data = iris_long, FUN = mean)
|
||
|
|
||
|
#Create list from data
|
||
|
iris_list <- split(iris, iris$Species)
|
||
|
head(iris_list)
|
||
|
|
||
|
summary_list = alist()
|
||
|
for( i in seq_along(iris_list)){
|
||
|
summary_list[[i]] <- summary(iris_list[[i]])
|
||
|
}
|
||
|
summary_list
|
||
|
|
||
|
#Create list using lapply()
|
||
|
lapply(iris_list, summary)
|
||
|
|
||
|
#Cars library
|
||
|
head(mtcars)
|
||
|
|
||
|
wt_hp_perCyl <- aggregate(cbind(wt, hp) ~ cyl, data = mtcars, FUN = mean)
|
||
|
wt_hp_perCyl
|
||
|
|
||
|
#merging
|
||
|
merged_result <- merge(mtcars, wt_hp_perCyl, by = "cyl", suffixes = c("", "_mean"))
|
||
|
head(merged_result)
|
||
|
|
||
|
#Selecting
|
||
|
sel_mtcars <- mtcars[ (mtcars$cyl == 6 | mtcars$cyl == 8) & mtcars$gear <= 3 & mtcars$mpg > 19 ,]
|
||
|
sel_mtcars
|
||
|
```
|
||
|
|
||
|
//PLOTS ETC
|
||
|
```{r}
|
||
|
plot(x = iris$Sepal.Length, y = iris$Sepal.Width)
|
||
|
abline(a = 3.5, b = -0.1, col = 'red', lwd = 3)
|
||
|
|
||
|
myFun <- function(input_vector, a ,b) {
|
||
|
y <- a + b*input_vector
|
||
|
return(y)
|
||
|
}
|
||
|
|
||
|
lines(iris$Sepal.Length,myFun(input_vector = iris$Sepal.Length, a = 4, b = -0.15), col = 'blue', cex = 3 )
|
||
|
|
||
|
par(mfrow = c(2, 2)) # for plotting 4 graphs in a single figure.
|
||
|
with(iris, hist(Sepal.Length))
|
||
|
with(iris, hist(Sepal.Width))
|
||
|
with(iris, hist(Petal.Length))
|
||
|
with(iris, plot(iris$Sepal.Length, Petal.Width))
|
||
|
|
||
|
require(ggplot2)
|
||
|
summary(midwest)
|
||
|
|
||
|
#SELEcTING SOMETHING FROM A GRAPH
|
||
|
diamonds[ diamonds$y == 58.9, ]
|
||
|
|
||
|
```
|
||
|
|
||
|
//ACTUAL TEST HERE
|
||
|
```{r}
|
||
|
a <- 7
|
||
|
|
||
|
myFun <- function(x) {
|
||
|
a <- 2
|
||
|
y <- x + a
|
||
|
return(y)
|
||
|
a
|
||
|
}
|
||
|
|
||
|
summary(a)
|
||
|
|
||
|
plot(Sepal.Length~Sepal.Width,data = iris)
|
||
|
plot(Sepal.Length~Species,data = iris)
|
||
|
|
||
|
summary(iris)
|
||
|
|
||
|
my_data <- iris_wide
|
||
|
iris_wide
|
||
|
my_data[[3]]$Variable[8]
|
||
|
iris$Species[51]
|
||
|
|
||
|
arg1 <- 1
|
||
|
arg2 <-2
|
||
|
input <- 3
|
||
|
|
||
|
myFun <- function(arg1, arg2, x) {
|
||
|
|
||
|
if( is.numeric(c(arg1,arg2))){
|
||
|
|
||
|
result <- cor(arg1, arg2)
|
||
|
|
||
|
} else{
|
||
|
|
||
|
print("Input should be numeric") }
|
||
|
|
||
|
}
|
||
|
|
||
|
y <- myFun(1, 2, x)
|
||
|
y
|
||
|
```
|
||
|
|
||
|
```{r}
|
||
|
myFun <- function(arg1, arg2, input) {
|
||
|
|
||
|
if( is.numeric(c(arg1,arg2))){
|
||
|
|
||
|
result <- cor(arg1, arg2)
|
||
|
|
||
|
} else{
|
||
|
|
||
|
print("Input should be numeric") }
|
||
|
|
||
|
}
|
||
|
|
||
|
y <- myFun(1, 2, 5)
|
||
|
y+1
|
||
|
cor(1, 2)
|
||
|
|
||
|
a<-1
|
||
|
b<-2
|
||
|
|
||
|
myFun <- function(a, b) {
|
||
|
result1 <- a * b
|
||
|
result2 <- a^b
|
||
|
result3 <- seq(from = a, to = b, length.out = 10)
|
||
|
my_list <- list()
|
||
|
my_list[[1]] <- result1
|
||
|
my_list[[2]] <- result2
|
||
|
my_list[[3]] <- result3
|
||
|
return(my_list)
|
||
|
}
|
||
|
|
||
|
myFun(10, 20)
|
||
|
|
||
|
summary(iris)
|
||
|
iris_slct <- iris[(iris$Species == "setosa" & iris$Sepal.Length<4.5) | (iris$Species =="versicolor" & iris$Sepal.Width>3.5), ]
|
||
|
iris_slct2 <- iris[(iris$Species =="virginica" & iris$Sepal.Width>3.5), ]
|
||
|
iris_slct
|
||
|
iris_slct2
|
||
|
|
||
|
library(reshape2)
|
||
|
require(reshape2)
|
||
|
|
||
|
?sapply
|
||
|
?ggplot
|
||
|
?aes
|
||
|
|
||
|
?aggregate
|
||
|
?by
|
||
|
?split
|
||
|
?reshape2
|
||
|
?cars
|
||
|
|
||
|
split(iris)
|
||
|
```
|
||
|
|
||
|
//THE EXERCISE
|
||
|
|
||
|
dat is the input dataframe
|
||
|
cat_colnames is the column name to summarize by.
|
||
|
sel_colname is the column what will be filtered/ selected over
|
||
|
sel_value is the value we filter/select over
|
||
|
The function should select all rows smaller than the sel_value within the sel_colname column.
|
||
|
The function should calculate the mean of all columns of your selected data per level of your cat_colname column.
|
||
|
The result of the function must be a list containing the result of the filtering and the result of the summarization.
|
||
|
In your R- script also run the function you created, you can use the iris dataset to test.
|
||
|
Save the result of your function to disk using the save() function
|
||
|
Write the function in a R script. Save the file with the same name as the function and with the .R extension.
|
||
|
Upload the .R file and the upload the result of your function to BB.
|
||
|
|
||
|
```{r}
|
||
|
dat <-iris
|
||
|
|
||
|
#To get you started here are some hints:
|
||
|
my_fun <- function(dat, cat_colname, sel_colname, sel_value) {
|
||
|
|
||
|
#to filter/ select using the variable argument use:
|
||
|
dat[dat[ ,sel_colname]<sel_value, ]
|
||
|
|
||
|
#calculate mean in a for loop
|
||
|
for(i in seq_along(cat_colname)){
|
||
|
dat2[[i]]<-mean(dat[[i]])
|
||
|
}
|
||
|
|
||
|
#to summarize use in the aggregate function:
|
||
|
dat3 <- as.formula(paste0(". ~", cat_colname))
|
||
|
|
||
|
my_list2 <-list()
|
||
|
my_list2[[1]] <-dat2 #This is the mean
|
||
|
my_list2[[2]] <-dat #This is the selection
|
||
|
my_list2[[3]] <-dat3 #This is the summary
|
||
|
}
|
||
|
|
||
|
my_fun(1, 2, 3, 4)
|
||
|
```
|
||
|
|
||
|
//QUESTION 12
|
||
|
|
||
|
Use ggplot to plot the esoph dataset.
|
||
|
|
||
|
In 1 figure create a separate bar plot for each alcohol and age group combination. There are 6 age groups and 4 alcohol groups so there should be 24 bar plots.
|
||
|
|
||
|
Map the ncases and ncontrols to the x-axis (so there are 2 bars per plot). Map the corresponding values to the y-axis. Map the tobacco group to the fill aesthetic.
|
||
|
|
||
|
Save the figure, export as pdf using RStudio. With the export / save as pdf and preview you can easily adjust the size (use larger numbers to fit in more graphs)
|
||
|
|
||
|
If the Rstudio window for plotting is not working you can use pdf(file = , height = 12.., width = 12) {plot here} dev.off()
|
||
|
|
||
|
Don't forget to upload the plot!
|
||
|
|
||
|
Hint: first melt the numeric data to 1 column
|
||
|
```{r}
|
||
|
require(ggplot2)
|
||
|
require(reshape2)
|
||
|
|
||
|
head(midwest)
|
||
|
|
||
|
my_alcohol_plot <- esoph
|
||
|
my_alcohol_plot <- melt(esoph)
|
||
|
|
||
|
head(my_alcohol_plot)
|
||
|
|
||
|
par(mfrow=c(2, 12))
|
||
|
|
||
|
ggplot(data=my_alcohol_plot, aes(x=agegp, y=alcgp), hist)
|
||
|
|
||
|
sapply(esoph[, 1:4], hist)
|
||
|
|
||
|
1+1
|
||
|
|
||
|
require(stats)
|
||
|
require(graphics) # for mosaicplot
|
||
|
summary(esoph)
|
||
|
|
||
|
## effects of alcohol, tobacco and interaction, age-adjusted
|
||
|
model1 <- glm(cbind(ncases, ncontrols) ~ agegp + tobgp * alcgp,
|
||
|
data = esoph, family = binomial())
|
||
|
anova(model1)
|
||
|
## Try a linear effect of alcohol and tobacco
|
||
|
model2 <- glm(cbind(ncases, ncontrols) ~ agegp + unclass(tobgp)
|
||
|
+ unclass(alcgp),
|
||
|
data = esoph, family = binomial())
|
||
|
summary(model2)
|
||
|
## Re-arrange data for a mosaic plot
|
||
|
ttt <- table(esoph$agegp, esoph$alcgp, esoph$tobgp)
|
||
|
o <- with(esoph, order(tobgp, alcgp, agegp))
|
||
|
ttt[ttt == 1] <- esoph$ncases[o]
|
||
|
tt1 <- table(esoph$agegp, esoph$alcgp, esoph$tobgp)
|
||
|
tt1[tt1 == 1] <- esoph$ncontrols[o]
|
||
|
tt <- array(c(ttt, tt1), c(dim(ttt),2),
|
||
|
c(dimnames(ttt), list(c("Cancer", "control"))))
|
||
|
mosaicplot(tt, main = "esoph data set", color = TRUE)
|
||
|
|
||
|
str(esoph)
|
||
|
plot.new()
|
||
|
boxplot(x=esoph$agegp, y=esoph$alcgp)
|
||
|
1+1
|
||
|
```
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|