# this is a comment
# use R as a calculator

3^2
#install.packages("tidyverse")
library(tidyverse)
library(survival)

# read data from a file on your computer (put the path within quotes)
# NB: for windows user use either "\\" or "/" instead of "\"
#orca = read.table("/Users/alessiocrippa/Dropbox/oralca.txt")
orca = read.table("http://www.stats4life.se/data/oralca.txt")

# How many people were still alive at the end of the follow-up?
# How many died because of oral cancer?
orca$event
tab = table(orca$event)
# what about percentages?
tabprop = prop.table(tab)
tabprop

# create a new variable
orca = mutate(orca, dead = event!= "Alive")
table(orca$dead)
# check if the variable was properly defined
table(orca$event, orca$dead)

# type ?name_of_the_function to visit the help page
?Surv
# create a survival object
all = Surv(orca$time, orca$dead)
all

# estimate the survival function using the KM estimator
fitkm = survfit(all ~ 1, data = orca)
fitkm
# different from the follow-up time
summary(orca$time)

#install.packages("survminer")
library(survminer)
ggsurvplot(fitkm)
ggsave("kmfigure.pdf")

#ggsave("/Users/alessiocrippa/Dropbox/kmfigure.pdf")
# know the working directory

# where R is saving/looking for files by default (i.e. in the working directory)
getwd()

# putting percentage on the y-asix
ggsurvplot(fitkm, risk.table = T, censor = F, fun = "pct",
           surv.median.line = "hv")
# or alternatively
library(scales)
ggsurvplot(fitkm)$plot +
  scale_y_continuous(labels = percent)


# long life-table
summary(fitkm)

# install.packages("ggfortify")
library(ggfortify)

datkm = fortify(fitkm)
ggplot(datkm, aes(x = time)) +
  geom_step(aes(y = surv)) +
  geom_step(aes(y = upper), linetype = "dotted") +
  geom_step(aes(y = lower), linetype = "dotted") +
  theme_classic() +
  scale_y_continuous(labels = percent) +
  labs(x = "Time, years", y = "Survival (%)")

# measures of central tendency
quantile(fitkm, 0.25)


# compare the survival curves across different levels of diagnostic stage
table(orca$stage)
fitkm2 = survfit(Surv(time, dead) ~ stage, data = orca)
fitkm2

# producing a nice figure (put conf.int = F to remove the confidence bands)
ggsurvplot(fitkm2, risk.table = T, conf.int = T)

# to test if there are differences according to tumoral stage
survdiff(Surv(time, dead) ~ stage, data = orca)

# fit a cox model for modelling the (log) hazard as a function of several covariates 
fit1 = coxph(Surv(time, dead) ~ stage + sex + age, data = orca)
fit1
summary(fit1)

#install.packages("Epi")
library(Epi)
ci.exp(fit1)

fit1 = coxph(Surv(time, dead) ~ stage + strata(sex) + I(age/10), 
             data = orca)
summary(fit1)
ci.exp(fit1)

# check the proportionality assumption
proptest = cox.zph(fit1)
proptest

# graph for checking the Schoenfeld residuals
library(survminer)
ggcoxzph(proptest)