1. Import the file "dati_SM.xls", which can be found on the "Materiale Didattico" page, into R.
# Set the working directory to the folder that contains "dati_SM.xls".
# "folder path" is a placeholder: replace it with the real path before running.
# String literals must use straight quotes ("..."); typographic quotes
# pasted from a word processor are a syntax error in R.
setwd("folder path") # Change the directory
# The directory can also be changed via the dropdown menu
# (Session -> Set working directory -> Choose directory)
getwd() # check which directory we are in
# read_excel() is provided by the readxl package, which may need to be
# installed first.
library(readxl)
# col_names = TRUE: use the first row of the sheet as variable names
esercitazione <- read_excel("dati_SM.xls", col_names = TRUE)
View(esercitazione) # view the imported dataset
# attach() copies the columns onto the search path so that they can be used
# by bare name (Sesso, Eta, VitaminaD, ...) in the following exercises.
# Columns added to or renamed in `esercitazione` afterwards are NOT reflected
# in these attached copies.
attach(esercitazione)
2. Get the size of the dataset
# dim() returns the number of rows followed by the number of columns
dim(x = esercitazione)
3. View the structure of the dataset to get information about the nature of the variables
# str() prints a compact overview: one line per variable with its type and
# its first few values
str(object = esercitazione)
4. Convert the variables "Sesso", "SM", "Disturbi_sensibilita", and "Disturbi_visivi" in the dataset to factors.
# You can do this one variable at a time.
# These assignments work on the attached copies, so they create converted
# variables in the workspace; the columns stored inside `esercitazione` are
# converted by the "all together" version below.
Sesso <- as.factor(Sesso)
SM <- as.factor(SM)
Disturbi_sensibilita <- as.factor(Disturbi_sensibilita)
Disturbi_visivi <- as.factor(Disturbi_visivi)
# Or all together.
# colnames() shows the column positions; 2, 3, 6 and 7 are expected to be
# Sesso, SM, Disturbi_sensibilita and Disturbi_visivi -- check the output
# before running the next line.
colnames(esercitazione)
esercitazione[, c(2, 3, 6, 7)] <- lapply(
  esercitazione[, c(2, 3, 6, 7)],
  as.factor
)
# Confirm that the four columns are now factors
str(esercitazione)
5. Change the names of the variables "SM" to "Sclerosi_Multipla" and "Soggetto" to "Identificativo_Soggetto".
# You can do this one variable at a time.
# names() lists the current column names; position 1 is "Soggetto" and
# position 3 is "SM".
names(esercitazione)
names(esercitazione)[1] <- "Identificativo_Soggetto"
names(esercitazione)[3] <- "Sclerosi_Multipla"
names(esercitazione)
# Or all together: index both positions at once and assign both new names.
# (Running this after the two lines above simply assigns the same names again.)
names(esercitazione)[c(1, 3)] <- c("Identificativo_Soggetto", "Sclerosi_Multipla")
names(esercitazione)
6. Sort the values of the variable "Eta" in ascending order
# Ascending is the default order; it is spelled out here for clarity.
# Missing values, if any, are dropped by sort().
sort(Eta, decreasing = FALSE)
7. Sort the values of the variable "VitaminaD" in descending order
# Sort in ascending order, then reverse to obtain descending order
rev(sort(VitaminaD))
8. What are the levels of the "Sesso" variable?
# levels() lists the categories of a factor; the first one is the reference
levels(x = Sesso)
9. Change the reference level of the "Sesso" variable, setting "Maschio" as the reference.
# Three alternative ways to put "Maschio" first (the first level of a factor
# is the reference level).
# First way: relevel() moves the chosen level to the first position.
# It only works on unordered factors.
Sesso <- relevel(Sesso, ref = "Maschio")
levels(Sesso)
# Second way: ordered() also sets the level order, but it returns an ORDERED
# factor (Maschio < Femmina), i.e. it treats sex as an ordinal variable.
Sesso <- ordered(Sesso, levels = c("Maschio", "Femmina"))
levels(Sesso)
# Third way: factor() with explicit levels. `ordered = FALSE` is required
# here: by default factor() keeps the "ordered" class of its input, so
# without it Sesso would remain an ordered factor after the second way
# (and relevel() would fail if this exercise were run again).
Sesso <- factor(Sesso, levels = c("Maschio", "Femmina"), ordered = FALSE)
levels(Sesso)
10. Create a new variable in the dataset called "sesso_ricodificato" which will be equal to 1 for all females and equal to 2 for all males.
# Numeric recoding of sex: 1 for "Femmina", 2 for every other value
# (missing values stay missing).
# Assigning into `esercitazione` adds the new variable directly to the dataset.
esercitazione[["sesso_ricodificato"]] <- ifelse(Sesso != "Femmina", 2, 1)
11. Determine the class of the newly created variable
# The new column must be reached through the data frame: it was added after
# attach() was called, so it cannot be accessed by its bare name.
class(esercitazione[["sesso_ricodificato"]])
12. Change the levels of the variable "Disturbi_sensibilita", replacing "no" with "NO" and "si" with "SI".
# First way: locate each level by its current label and overwrite only that
# label, so the result does not depend on the order of the levels.
levels(Disturbi_sensibilita)[which(levels(Disturbi_sensibilita) == "no")] <- "NO"
levels(Disturbi_sensibilita)[which(levels(Disturbi_sensibilita) == "si")] <- "SI"
levels(Disturbi_sensibilita)
# Second way: replace all labels at once. The new labels are matched by
# position, so they must be given in the same order as the existing levels.
levels(Disturbi_sensibilita) <- c("NO", "SI")
13. Create a new variable called "Eta_Ricod" that will be: "Adolescenti" when the age is between 11 and 19 years (inclusive), "Adulti" when the age is between 20 and 64 years (inclusive), and "Anziani" when the age is 65 years or older.
# Recode age into the three classes requested by the exercise using nested
# ifelse() calls, evaluated from the youngest class to the oldest:
#   11-19 -> "Adolescenti", 20-64 -> "Adulti", 65 and over -> "Anziani".
# Ages that fall in none of the classes (e.g. under 11) become a real missing
# value (NA_character_) rather than the text "NA", so that table() and
# is.na() treat them as missing.
Eta_Ricod <- ifelse(
  Eta >= 11 & Eta <= 19, "Adolescenti",
  ifelse(
    Eta >= 20 & Eta <= 64, "Adulti",
    ifelse(Eta >= 65, "Anziani", NA_character_)
  )
)
Eta_Ricod
14. Calculate the absolute frequency distribution of the newly created "Eta_Ricod" variable
# Count how many subjects fall in each age class.
# useNA = "no" is the default: missing values are left out of the counts.
table(Eta_Ricod, useNA = "no")
15. Extract the vitamin D values for female subjects only and create a data frame containing only these values
# Keep only the rows of female subjects and only column 4 (the vitamin D
# values), then convert the result to a plain data frame.
dati_ridotti1 <- as.data.frame(
  subset(esercitazione, subset = Sesso == "Femmina", select = 4)
)
dati_ridotti1
16. Create another reduced data frame for subjects who have a vitamin D level equal to 30
# Select the rows whose vitamin D level is exactly 30; which() discards rows
# where the comparison is missing. All columns are kept.
dati_ridotti2 <- as.data.frame(
  esercitazione[which(esercitazione$VitaminaD == 30), ]
)
dati_ridotti2
17. Calculate the mean and median for the variables "VitaminaD" and "Eta", divided by sex
# aggregate() splits columns 4 and 5 (expected to be VitaminaD and Eta --
# check with names(esercitazione)) by the levels of Sesso and applies the
# summary function to each group. The grouping column is shown as "Group.1".
aggregate(x = esercitazione[, 4:5], by = list(Sesso), FUN = mean)
aggregate(x = esercitazione[, 4:5], by = list(Sesso), FUN = median)
18. Calculate the mean of all numeric variables in the dataset
# mean() is only meaningful for numeric columns: on factors it returns NA
# with a warning. Filter() keeps just the numeric columns, then vapply()
# computes one mean per column (missing values excluded) and returns a named
# numeric vector.
# Note: a numeric identifier column, if present, is averaged too.
vapply(Filter(is.numeric, esercitazione), mean, numeric(1), na.rm = TRUE)
19. Calculate the mean of the "VitaminaD" variable, rounding the result to two decimal places
# Mean of VitaminaD, rounded to two decimal places
round(mean(VitaminaD), digits = 2)