# 1. Load the dataset hepatitis.csv.
# Location of the raw data; adjust this path to wherever hepatitis.csv is stored.
data_path <- "/Users/Davide/Desktop/tutor/Hepatits/hepatitis.csv"
# The file is semicolon-separated and marks missing values with "?".
# The data frame is named `hepatitis` because all later steps refer to it by that name.
hepatitis <- read.csv2(data_path, sep = ";", na.strings = "?")
dim(hepatitis)
# 2. Proceed with dataset inspection and assign the correct class to variables if necessary.
str(hepatitis)
# Columns 1 and 3:12 hold category codes, so they are stored as factors.
factor_cols <- c(1, 3:12)
hepatitis[factor_cols] <- lapply(hepatitis[factor_cols], as.factor)
# Columns 13:17 are the continuous measurements. read.csv2() assumes a decimal
# comma, so values written with a decimal point are read as character; coerce
# those columns to numeric and leave already-numeric columns untouched.
numeric_cols <- 13:17
hepatitis[numeric_cols] <- lapply(hepatitis[numeric_cols], function(x) {
  if (is.numeric(x)) x else as.numeric(as.character(x))
})
str(hepatitis)
# We have converted the categorical variables that were read as int or character into factors,
# and the continuous variables that were read as character into numeric.
# 3. Perform missing data analysis: assess where missing data should be removed and where data imputation is required.
library(VIM)
# Count missing (TRUE) versus observed (FALSE) cells across the whole data frame.
table(is.na(hepatitis))
# The original run reported 122 missing cells. Plot how they are distributed
# across variables before deciding how to handle them.
aggr_plot <- aggr(
  hepatitis,
  col = c("navyblue", "red"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(hepatitis),
  cex.axis = 0.7,
  gap = 3,
  ylab = c("Histogram of missing data", "Pattern")
)
# 'urea' is the variable most affected by missing data, so it is removed
# altogether (it is column 17); the remaining gaps are imputed afterwards.
hepatitis1 <- hepatitis[-17]
# Re-count the missing cells once urea is gone.
table(is.na(hepatitis1))
# The 55 remaining missing values sit in bilirubin, alk.phosphate,
# aspartate.transaminase and albumin. Replace each one with the mean of the
# observed values of its own column.
imputed_cols <- c("bilirubin", "alk.phosphate", "aspartate.transaminase", "albumin")
hepatitis1[imputed_cols] <- lapply(hepatitis1[imputed_cols], function(values) {
  values[is.na(values)] <- mean(values, na.rm = TRUE)
  values
})
table(is.na(hepatitis1)) # No missing data remains
# 4. How many women younger than 38 use steroids? (sex = 2, steroids = 2)
# Women (sex code "2") younger than 38 who use steroids (steroids code "2").
# The categorical columns were converted to factors earlier, so their codes are
# compared as strings; column names are resolved inside hepatitis1 by subset().
donne_steroidi <- subset(hepatitis1, sex == "2" & age < 38 & steroids == "2")
nrow(donne_steroidi) # 3 women
# 5. Reclassify the 'CLASS' variable using the levels() function (2: dead, 1: alive).
# Show the current coding of the outcome before relabelling it.
levels(hepatitis1$CLASS)
# Levels are replaced by position: code 1 becomes "vivo" (alive) and
# code 2 becomes "morto" (dead).
levels(hepatitis1$CLASS) <- c("vivo", "morto")
# 6. Calculate the mean, mode, and median of the numerical variables in the dataset, and count the subjects in each level of the following categorical variables: sex, steroids, fatigue, malaise.
# Numerical variables: age (column 2) and the laboratory measurements (columns 13:16).
numeric_vars <- hepatitis1[, c(2, 13:16)]
# summary() reports the mean and the median (plus quartiles and range).
summary(numeric_vars)
# summary() does not report the mode, so compute it explicitly.
# Returns the most frequent non-missing value; ties go to the value seen first.
stat_mode <- function(x) {
  x <- x[!is.na(x)]
  if (length(x) == 0) {
    return(NA_real_)
  }
  distinct <- unique(x)
  distinct[which.max(tabulate(match(x, distinct)))]
}
vapply(numeric_vars, stat_mode, numeric(1))
# Categorical variables: number of subjects per level.
table(hepatitis1$sex)
table(hepatitis1$steroids)
table(hepatitis1$fatigue)
table(hepatitis1$malaise)
# Alternatively, as formatted summary tables. The calls are namespace-qualified
# because library(table1) is only attached further down the script.
table1::table1(~ bilirubin + age + alk.phosphate + aspartate.transaminase + albumin, data = hepatitis1)
table1::table1(~ sex + steroids + fatigue + malaise, data = hepatitis1)
# 7. Evaluate whether the number of deceased subjects differs significantly between those who have taken steroids and those who haven't.
# Chi-squared test of independence on the CLASS x steroids contingency table.
chisq.test(table(hepatitis1$CLASS, hepatitis1$steroids))
# The p-value is greater than 0.05, so there is no significant difference between those who have taken steroids and those who haven't.
# 8. Assess whether albumin levels are statistically different between steroid users and non-users using the appropriate test, justify the choice of test, and represent the result graphically.
# Choice of test: first assess whether 'albumin' is normally distributed.
shapiro.test(hepatitis1$albumin)
# The p-value is less than 0.05, so we reject the null hypothesis of normality and
# compare the two independent groups with a non-parametric test (Wilcoxon rank-sum):
wilcox.test(albumin ~ steroids, data = hepatitis1, paired = FALSE)
# p-value = 0.0011: we reject the null hypothesis and conclude that albumin levels are
# statistically different between those who use steroids and those who don't. Let's visualize this:
# Relabel the steroid codes (1, 2) so the plot shows readable group names.
levels(hepatitis1$steroids) <- c("no", "yes")
boxplot(albumin ~ steroids, data = hepatitis1)
# 9. Choose the most appropriate graph to visualize the relationship between albumin and alk.phosphate.
# Both variables are continuous, so a scatter plot shows how they vary together.
plot(albumin ~ alk.phosphate, data = hepatitis1,
     main = "Albumin vs alk.phosphate",
     xlab = "alk.phosphate", ylab = "albumin")
# 10. Select the most suitable graph to display the frequencies of antiviral usage.
# Bar chart of the relative frequencies of antiviral use.
# NOTE(review): the column is assumed to be named `antivirals`, matching the English
# column names used elsewhere in this script (e.g. `steroids`); confirm with names(hepatitis1).
barplot(prop.table(table(hepatitis1$antivirals)), main = "Utilizzo antivirali", xlab = "Gruppi", ylab = "Frequenze")
# or, with ggplot2 (namespace-qualified because library(ggplot2) is only attached
# further down the script). after_stat() replaces the deprecated ..count.. notation.
ggplot2::ggplot(
  hepatitis1,
  ggplot2::aes(factor(antivirals), fill = factor(antivirals))
) +
  ggplot2::geom_bar(ggplot2::aes(y = ggplot2::after_stat(count / sum(count)))) +
  ggplot2::ggtitle("Frequenze utilizzo antivirali")
# 11. Create a graph showing the median aspartate transaminase levels in the two classes (dead and alive) and by steroid use.
library(table1)
library(ggplot2)
library(ggpubr)
# Base-graphics boxplot of aspartate.transaminase per CLASS; the bold line in
# each box is the group median. The y axis is limited to 0-280, so larger
# values fall outside the plotted area.
boxplot(
  aspartate.transaminase ~ CLASS,
  data = hepatitis1,
  ylim = c(0, 280),
  xlab = "Groups",
  ylab = "Aspartate",
  main = "Aspartate Levels in Two Classes"
)
## The same comparison drawn with ggpubr: boxes coloured by CLASS, with the
## individual observations overlaid as jittered points.
ggboxplot(
  data = hepatitis1,
  x = "CLASS",
  y = "aspartate.transaminase",
  color = "CLASS",
  palette = "jco",
  add = "jitter",
  ylim = c(0, 280)
)
# 12. Is there a significant relationship between bilirubin and alkaline phosphatase (alk.phosphate)?
# Model: regress bilirubin on alk.phosphate. A glm with the gaussian family
# (identity link by default) is an ordinary linear regression.
model <- glm(bilirubin ~ alk.phosphate, data = hepatitis1, family = "gaussian")
# The coefficient table gives the estimates and the p-value of the slope.
summary(model)
# p-value = 0.0496: SIGNIFICANT. There is a significant relationship between bilirubin and phosphatase.
# B0: 1.001432 (intercept: the expected bilirubin when alk.phosphate = 0)
# B1: 0.004045 (slope: the change in bilirubin for a one-unit increase in alk.phosphate)
# An alternative approach: a correlation test.
# 1) Check the normality of both variables to choose the correlation method:
shapiro.test(hepatitis1$bilirubin)
shapiro.test(hepatitis1$alk.phosphate)
# 2) Since they are not normally distributed, use the rank-based Spearman method:
cor.test(hepatitis1$bilirubin, hepatitis1$alk.phosphate, method = "spearman")
# p-value less than 0.05: there is a relationship between bilirubin and phosphatase.
# 13. Evaluate whether there is a significant relationship between CLASS and albumin, treating the steroids variable as a confounder.
# Logistic regression of CLASS on albumin, adjusted for steroid use.
# Variables are named in the formula and looked up through `data =`, which keeps
# the coefficient labels readable and lets predict() be used with new data.
modello <- glm(CLASS ~ albumin + steroids, data = hepatitis1, family = "binomial")
# Summary of the model:
summary(modello)
# Odds ratios: the exponentiated coefficients.
exp(coef(modello))
# There is a statistically significant relationship between 'albumin' and 'CLASS,' after accounting for the 'steroids' variable.
# B0: -6.7060 (the log-odds of the modelled outcome when albumin = 0 and steroids is at its reference level)
# B1: exp(2.16): 8.67: An increase of one unit in 'albumin' multiplies the odds of death by 8.67 times, adjusting for steroid use.
# NOTE(review): with a factor response, a binomial glm models the odds of the
# levels after the first one; confirm that the second level of CLASS is the
# intended "death" category before reporting this odds ratio.
# 14. Display a correlogram showing the correlations among all continuous variables and indicate which pairs of variables have a correlation greater than 0.6.
library(corrplot)
# Correlation matrix of the continuous variables in hepatitis1: age (column 2)
# and the laboratory measurements (columns 13:16), the same columns summarised
# as numerical variables earlier in the script.
# NOTE(review): the original indexed columns 13:17 of an undefined `epatite2`;
# column 17 (urea) was dropped when hepatitis1 was built. Confirm the selection.
C <- cor(hepatitis1[, c(2, 13:16)])
corrplot(C, method = "number")
# List each pair of distinct variables whose correlation exceeds 0.6 in absolute
# value; upper.tri() keeps every pair once and skips the diagonal.
strong_pairs <- which(abs(C) > 0.6 & upper.tri(C), arr.ind = TRUE)
data.frame(
  var1 = rownames(C)[strong_pairs[, "row"]],
  var2 = colnames(C)[strong_pairs[, "col"]],
  r = C[strong_pairs]
)