Import the "CENTENARI e BIOCHIMICA" dataset, which can be found at the following link: https://urly.it/38h1.
How many subjects and variables are there in the dataset?
library(readxl)
esercitazione <- read_excel("CENTENARI_BIOCHIMICA.xls", col_names=TRUE)
dim(esercitazione)
Remove samples with missing data and save them in a new dataset called DATI.
View(esercitazione)
DATI <- na.omit(esercitazione)
Calculate the measures of position and dispersion for the age variable and the absolute and relative frequency distributions for the group variable.
summary(DATI$age)
t = table(DATI$age); t[t == max(t)]
sd(DATI$age); var(DATI$age);
range(DATI$age)
IQR(DATI$age)
cv <- function(x) {
m <- mean(x)
sd <- sd(x)
R <- (sd / m) * 100 return(R) }
cv(DATI$age)
table(DATI$Gruppo);
prop.table(table(DATI$Gruppo))
Reclassify the categorical variables that R erroneously interpreted as numeric.
str(DATI)
DATI$Gruppo <- as.factor(DATI$Gruppo)
DATI$uo <- as.factor(DATI$uo) # operational unit DATI$FUMO <- as.factor(DATI$FUMO)
DATI$INFARTO <- as.factor(DATI$INFARTO)
DATI$INSUFF_RENE <- as.factor(DATI$INSUFF_RENE)
DATI$DIABETE <- as.factor(DATI$DIABETE)
DATI$TEST_1_DIABETE <- as.factor(DATI$TEST_1_DIABETE)
DATI$TEST_2_DIABETE <- as.factor(DATI$TEST_2_DIABETE)
Change the variable names “uo” to “unità_operativa” and “age” to “Età”.
names(DATI)
names(DATI)[2] <- "unità_operativa"
names(DATI)[4] <- "Età" names(DATI) # Alternatively, all at once names(DATI)[c(2, 4)] <- c("unità_operativa", "Età") names(DATI)
Create a vector called “età_crescente” containing the values sorted in ascending order of the “Età” variable.
età_crescente <- sort(DATI$Età)
Create a vector called “COL_TOT_decrescente” containing the values sorted in descending order of the COL_TOT variable.
COL_TOT_decrescente <- sort(DATI$COL_TOT, decreasing = TRUE)
What are the levels of the “Gruppo” variable?
levels(DATI$Gruppo)
Change the reference level of the “Gruppo” variable, setting "CTRL" (control) as the reference.
# First way
DATI$Gruppo <- relevel(DATI$Gruppo, "CTRL") levels(DATI$Gruppo)
# Second way
DATI$Gruppo <- ordered(DATI$Gruppo, levels = c("CTRL", "CENT", "FIGLI")) levels(DATI$Gruppo)
# Third way
DATI$Gruppo <- factor(v, levels = c("CTRL", "CENT", "FIGLI")) levels(b)
How many centenarians have a BMI < 25?
dim(subset(DATI, DATI$Gruppo == "CENT" & DATI$BMI < 25))
# Alternatively
table(DATI$Gruppo, DATI$BMI < 25)
Create a new variable in the dataset called “fumo_ricodificato” that will equal “Fumatore” for FUMO = 1 and “Non_fumatore” for FUMO = 0.
table(DATI$FUMO)
DATI$fumo_ricodificato <- ifelse(DATI$FUMO == "1", "Fumatore", "Non_fumatore")
table(DATI$fumo_ricodificato)
Change the levels of the DIABETE variable, replacing 0 with "NO" and 1 with "SI".
# First way
levels(DATI$DIABETE)[levels(DATI$DIABETE) == "0"] <- "NO"
levels(DATI$DIABETE)[levels(DATI$DIABETE) == "1"] <- "SI" levels(DATI$DIABETE)
# Second way
levels(DATI$DIABETE) <- c("NO", "SI")
Create a new variable (separate from the dataset) called glicemia_Ricod that will be equal to: “Ipoglicemia” when the blood glucose value is below 60, “normale” when it is between 60 and 100 (inclusive), and “Iperglicemia” when above 100.
Glicemia_Ricod <- ifelse(DATI$GLICEMIA < 60, "ipoglicemia", ifelse(DATI$GLICEMIA >= 60 & DATI$GLICEMIA <= 100, "normale", "iperglicemia"))
table(Glicemia_Ricod)
How many hypoglycemic, normal, and hyperglycemic subjects are there?
table(Glicemia_Ricod)
Extract the albumin values only for the controls and create a data.frame with these values only.
dati_ridotti <- as.data.frame(subset(DATI, Gruppo == "CTRL")[, 20])
# Alternatively, in parts
dati_ridotti <- subset(DATI, Gruppo == "CTRL")
dati_ridotti <- dati_ridotti[, 20]
Calculate the mean and median for the Creatinine and Urea variables divided by Group.
aggregate(DATI[, 7:8], list(DATI$Gruppo), mean)
aggregate(DATI[, 4:5], list(DATI$Gruppo), median)
Display with a graph if the distribution of the GLICEMIA variable is different between Diabetics and non-Diabetics.
boxplot(DATI$GLICEMIA ~ DATI$DIABETE)
# With ggplot
p <- ggplot(DATI, aes(x = DIABETE, y = GLICEMIA)) + geom_boxplot()
p p1 <- ggplot(DATI, aes(x = DIABETE, y = GLICEMIA)) + geom_boxplot(outlier.colour = "red", outlier.shape = 8, outlier.size = 4)
p1 p2 <- ggplot(DATI, aes(x = DIABETE, y = GLICEMIA, color = Gruppo, fill = "red")) + geom_boxplot(outlier.colour = "red", outlier.shape = 8, outlier.size = 4) p2
Display with a graph the percentages of subjects belonging to the various operational units.
percentuali <- round(prop.table(table(DATI$unità_operativa)) * 100)
labels <- paste(levels(DATI$unità_operativa), percentuali, "%", sep = "_")
pie(table(DATI$unità_operativa, label = as.factor(labels)))
# With ggplot
FREQ <- as.data.frame(table(DATI$unità_operativa))
FREQ <- cbind(FREQ, labels)
pie <- ggplot(FREQ, aes(x = "", y = Freq, fill = labels)) + geom_bar(stat = "identity", width = 1)
pie <- pie + coord_polar("y", start = 0) + geom_text(aes(label = labels), position = position_stack(vjust = 0.5))
pie <- pie + labs(x = NULL, y = NULL, fill = NULL)
pie <- pie + theme_classic() + theme(axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(hjust = 0.5, color = "#666666"))
Display with a graph the relationship between GLICEMIA and HOMA-IR.
scatter.smooth(DATI$GLICEMIA, DATI$`HOMA-IR`)
plot(DATI$GLICEMIA, DATI$`HOMA-IR`)
# With ggplot
p1 <- ggplot(DATI, aes(x = DATI$GLICEMIA, y = `HOMA-IR`)) + geom_point(size = 3) + geom_smooth(method = 'lm')
p2 <- ggplot(DATI, aes(x = DATI$GLICEMIA, y = `HOMA-IR`, colour = Gruppo)) + geom_point(size = 3) + geom_smooth(method = 'lm')
Save the produced graphs in a PDF.
Is the number of subjects who had a heart attack significantly different between smokers and non-smokers? Choose the most appropriate test and indicate the p-value.
chisq.test(table(DATI$INFARTO, DATI$FUMO))
Is there a difference in total cholesterol values (COL_TOT) between subjects with and without kidney blockage (INSUFFICIENZA_RENE)? Choose the test you find appropriate and interpret the result.
# Checking normality and homoscedasticity using the following tests:
shapiro.test(DATI$COL_TOT)
bartlett.test(DATI$COL_TOT ~ DATI$INSUFF_RENE)
# We have a binary categorical variable and a continuous variable, so we apply the t-test. t.test(DATI$COL_TOT ~ DATI$INSUFF_RENE, paired = FALSE) # No significant difference exists in total cholesterol values
Choose the most appropriate plot to display the distribution of the variable GLICEMIA.
hist(DATI$GLICEMIA)
plot(density(DATI$GLICEMIA))
# with ggplot ggplot(data = DATI, aes(DATI$GLICEMIA)) + geom_histogram()
Create a barplot showing the mean blood glucose levels among the subject groups CENT, FIGLI, and CTRL, and add a legend.
barplot(tapply(DATI$GLICEMIA, DATI$Gruppo, mean), col = c("purple", "green3", "cyan"), main = "Barplot", ylim = c(0, 120)) legend("topright", c("CTRL", "CENT", "FIGLIC"), cex = 0.8, fill = c("purple", "green3", "cyan"))
# with ggplot
p <- ggplot(DATI, aes(x = Gruppo, y = GLICEMIA, fill = DATI$Gruppo)) + geom_bar(stat = "identity") p
Do diabetic individuals have higher blood glucose values than non-diabetic individuals?
# Applying the Shapiro test to check for normality of the glucose variable.
shapiro.test(DATI$GLICEMIA)
wilcox.test(DATI$GLICEMIA ~ DATI$DIABETE, paired = FALSE)
Are cholesterol values different across operating units?
shapiro.test(DATI$COL_TOT)
bartlett.test(DATI$COL_TOT ~ DATI$unità_operativa)
aov(DATI$COL_TOT ~ DATI$unità_operativa)
summary(aov(DATI$COL_TOT ~ DATI$unità_operativa))
# To evaluate which groups have significant differences:
pairwise.t.test(DATI$COL_TOT, DATI$unità_operativa, p.adj = "bonf")
# Units 3 and 5 are significantly different from units 1 and 2.
Is there a correlation between COL_TOT and BMI, and between Hematocrit and Red Blood Cells?
cor.test(DATI$BMI, DATI$COL_TOT)
cor.test(DATI$Ematocrito, DATI$Eritrociti)
Investigate the correlations present in the dataset graphically.
library(corrplot)
M <- cor(DATI[, 4:30])
corrplot(M, method = "circle")
How many smokers are there among the groups?
table(DATI$FUMO, DATI$Gruppo)
Calculate the mean age variable in the CTRL group.
tapply(DATI$Età, DATI$Gruppo, mean)
# or
DATI_CTRL <- subset(DATI, DATI$Gruppo == "CTRL")
mean(DATI_CTRL$Età)
Is there a relationship between heart attack and kidney failure?
table(DATI$INFARTO, DATI$INSUFF_RENE)
fisher.test(table(DATI$INFARTO, DATI$INSUFF_RENE))
Studies have been conducted on immune abnormalities in autistic children; the data on serum concentration levels (units/ml) of an antigen are reported for three groups of children under the age of 10. Are there differences among the three groups: autistic, normal, and developmentally delayed?
Autistici <- c(755, 383, 380, 215, 400, 343, 415, 360, 345, 450, 410, 435, 460, 360, 225, 900, 365, 440, 820, 400, 170, 300, 325, 345, 230, 370, 285, 315, 195, 270, 305, 375, 220) normal <- c(165, 390, 290, 435, 235, 320, 330, 205, 375, 345, 305, 220, 270, 355, 360, 335, 305, 360, 335, 305, 325, 245, 285, 370, 345) ritardati <- c(380, 510, 315, 565, 715, 380, 390, 245, 155, 335, 295, 200, 105, 105, 245) # Create a vector with all serum concentration values. concentrazionesierica <- c(autistici, normal, ritardati) length(concentrazionesierica) # Create a vector containing the categories corresponding to serum concentration values. GRUPPI <- c(rep("AUT", length(autistici)), rep("NORM", length(normal)), rep("RIT", length(ritardati))) GRUPPI <- as.factor(GRUPPI) # Check for normality: shapiro.test(concentrazionesierica) # Since we do not have a normal distribution, we apply a non-parametric test, specifically the Kruskal-Wallis test (the non-parametric equivalent of the ANOVA test). # Set the continuous variable as Y and the categorical variable as X. kruskal.test(concentrazionesierica ~ GRUPPI)
The dietitian Mario Smilzo claims that his diet leads to rapid weight loss. To prove it, he takes a group of 10 people and weighs them before and after the diet. Their weight before and after the diet is:
Investigate whether the diet resulted in a significant weight reduction.
PESO1 <- c(80, 90, 96, 85, 102, 105, 99, 89, 98, 78, 75, 79, 98, 82, 95, 102, 95, 80, 97, 76)
prepost <- c(rep("pre", 10), rep("post", 10))
shapiro.test(PESO1)
bartlett.test(PESO1, prepost)
t.test(PESO1 ~ prepost, paired = TRUE, alternative = "less")
A study aims to evaluate whether an anti-cancer drug is capable of reducing tumor size. The drug is administered to 3 groups of animals: an untreated group (NO_F), a group treated with the study drug (F_SPE), and a group treated with the traditional drug (F_TRA). The results related to the dry weight of the biopsy in mg are shown in the table.
# Comparison among groups with ANOVA
TRATTAMENTI <- c(NO_F, F_SPE, F_TRA)
Gruppi <- c(rep("NO_F", length(NO_F)), rep("F_SPE", length(F_SPE)), rep("F_TRA", length(F_TRA)))
shapiro.test(TRATTAMENTI)
bartlett.test(TRATTAMENTI, Gruppi)
model <- aov(TRATTAMENTI ~ Gruppi)
summary(model)
TukeyHSD(model)
In a hospital, it is hypothesized that living conditions and stress may influence the response to a particular therapy. If so, many therapy outcomes could be improved by addressing living conditions. Therefore, a correlation between quality of life (QOL), evaluated through questionnaire scores (from 1 to 100), and response to therapy (R2T), evaluated with a clinical improvement score (from 1 to 10), was investigated on a sample of 10 subjects:
Verify if there is a correlation.
# The data are ordinal categorical
QOL <- c(10, 90, 20, 99, 35, 55, 46, 80, 75, 66)
R2T <- c(1, 8, 2, 9, 3, 5, 4, 7, 7, 6)
cor.test(QOL, R2T, method = "kendall")