# Read the HDI table workbook straight from its GitHub URL.
# rio is called through its namespace (rio::import), so it is not attached
# with library().
HDIOriginal <- rio::import(
  file = "https://github.com/Fundamentals-Sarah/HW5Repo2/raw/refs/heads/main/HDR25_Statistical_Annex_HDI_Table.xlsx"
)
## New names:
## • `` -> `...1`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`
# Print the raw import to see which rows and columns can be cleaned up
# straight away.
HDIOriginal
# Keep only rows 4 through 203 of the raw sheet.
HDICleaner <- HDIOriginal[seq(4, 203), ]
# Look at the first few rows to confirm the row subset applied correctly.
head(HDICleaner)
# Keep only the columns that carry information: the first three, then every
# second column from 5 through 15.
HDICleaner <- HDICleaner[, c(1:3, seq(5, 15, by = 2))]
# Check that the remaining columns look right.
head(HDICleaner)
# Give the nine retained columns single-line descriptive names in one
# assignment, then inspect the structure to confirm the names took effect.
names(HDICleaner)[1:9] <- c(
  "HDI_Rank",
  "Country",
  "HDI_Value_2023",
  "Life_Expectancy",
  "Expected_Yrs_Schooling_2023",
  "Mean_Schooling_Yrs_2023",
  "Gross_GNI_Per_Capita",
  "GNI_Per_Capita_Minus_HDI_Rank",
  "HDI_Rank_2022"
)
str(HDICleaner)
## 'data.frame': 200 obs. of 9 variables:
## $ HDI_Rank : chr NA "HDI rank" NA NA ...
## $ Country : chr NA "Country" NA "Very high human development" ...
## $ HDI_Value_2023 : chr "Human Development Index (HDI)" "Value" "2023" NA ...
## $ Life_Expectancy : chr "Life expectancy at birth" "(years)" "2023" NA ...
## $ Expected_Yrs_Schooling_2023 : chr "Expected years of schooling" "(years)" "2023" NA ...
## $ Mean_Schooling_Yrs_2023 : chr "Mean years of schooling" "(years)" "2023" NA ...
## $ Gross_GNI_Per_Capita : chr "Gross national income (GNI) per capita" "(2021 PPP $)" "2023" NA ...
## $ GNI_Per_Capita_Minus_HDI_Rank: chr "GNI per capita rank minus HDI rank" NA "2023" NA ...
## $ HDI_Rank_2022 : chr "HDI rank" NA "2022" NA ...
# Rows 1-4, 79, 130 and 174 held the old column-header text and the HDI
# category labels. With the columns renamed and the category not needed,
# keep every row position except those.
HDIClean <- HDICleaner[
  setdiff(seq_len(nrow(HDICleaner)), c(1:4, 79, 130, 174)),
]
# Inspect the structure of the trimmed table.
str(HDIClean)
## 'data.frame': 193 obs. of 9 variables:
## $ HDI_Rank : chr "1" "2" "2" "4" ...
## $ Country : chr "Iceland" "Norway" "Switzerland" "Denmark" ...
## $ HDI_Value_2023 : chr "0.97199999999999998" "0.97" "0.97" "0.96199999999999997" ...
## $ Life_Expectancy : chr "82.691000000000003" "83.308000000000007" "83.953999999999994" "81.933000000000007" ...
## $ Expected_Yrs_Schooling_2023 : chr "18.850589750000001" "18.792850489999999" "16.667530060000001" "18.704010010000001" ...
## $ Mean_Schooling_Yrs_2023 : chr "13.908926279999999" "13.117962179999999" "13.94912109" "13.027320599999999" ...
## $ Gross_GNI_Per_Capita : chr "69116.937359999996" "112710.0211" "81948.901769999997" "76007.856690000001" ...
## $ GNI_Per_Capita_Minus_HDI_Rank: chr "12" "0" "5" "4" ...
## $ HDI_Rank_2022 : chr "3" "1" "2" "4" ...
# I want to examine means of certain variables, but the columns that should be
# numeric were read in as character, so they need converting.
# Instead of one trimws() line and one as.*() line per column, the columns are
# listed once and all go through the same helper.

# Columns holding whole numbers (ranks and rank differences).
HDIIntegerCols <- c(
  "HDI_Rank",
  "GNI_Per_Capita_Minus_HDI_Rank",
  "HDI_Rank_2022"
)
# Columns holding decimal values.
HDIDecimalCols <- c(
  "HDI_Value_2023",
  "Life_Expectancy",
  "Expected_Yrs_Schooling_2023",
  "Mean_Schooling_Yrs_2023",
  "Gross_GNI_Per_Capita"
)

# Trim surrounding white space from `values` and convert them with `convert`
# (as.integer or as.numeric).
# Entries that are present but cannot be converted still become NA, exactly as
# before, but they are reported together with the column name via message()
# rather than through the generic "NAs introduced by coercion" warning.
# Returns the converted vector, the same length as `values`.
HDIParseColumn <- function(values, column, convert) {
  trimmed <- trimws(values)
  # Warnings are silenced for this single call only; every entry that fails
  # to convert is reported explicitly just below.
  parsed <- suppressWarnings(convert(trimmed))
  rejected <- unique(trimmed[!is.na(trimmed) & is.na(parsed)])
  if (length(rejected) > 0) {
    message(
      "Column ", column, ": non-numeric entries set to NA: ",
      paste0("\"", rejected, "\"", collapse = ", ")
    )
  }
  parsed
}

# Convert the whole-number columns to integer and the rest to double.
HDIClean[HDIIntegerCols] <- Map(
  HDIParseColumn,
  HDIClean[HDIIntegerCols],
  HDIIntegerCols,
  MoreArgs = list(convert = as.integer)
)
HDIClean[HDIDecimalCols] <- Map(
  HDIParseColumn,
  HDIClean[HDIDecimalCols],
  HDIDecimalCols,
  MoreArgs = list(convert = as.numeric)
)
# Life expectancy: the mean across countries is 73.11.
summary(HDIClean[["Life_Expectancy"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 54.46 67.39 73.49 73.11 78.34 85.71
# Expected years of schooling: the mean across countries is 13.585.
summary(HDIClean[["Expected_Yrs_Schooling_2023"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.635 11.505 13.336 13.585 15.888 20.846
# Mean years of schooling: the mean across countries is 9.173.
summary(HDIClean[["Mean_Schooling_Yrs_2023"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.412 6.780 9.933 9.173 11.642 14.296
# GNI per capita: the mean across countries is $24,620.70.
summary(HDIClean[["Gross_GNI_Per_Capita"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 688.3 5746.6 15866.5 24620.7 36793.0 166811.7
# Write the cleaned table to an RDS file in the working directory.
saveRDS(object = HDIClean, file = "HDIClean.RDS")
# Read the file back under its own name and inspect the structure to confirm
# the RDS round trip kept the column types.
HDICleanRDS <- readRDS(file = "HDIClean.RDS")
str(HDICleanRDS)
## 'data.frame': 193 obs. of 9 variables:
## $ HDI_Rank : int 1 2 2 4 5 5 7 8 8 10 ...
## $ Country : chr "Iceland" "Norway" "Switzerland" "Denmark" ...
## $ HDI_Value_2023 : num 0.972 0.97 0.97 0.962 0.959 0.959 0.958 0.955 0.955 0.951 ...
## $ Life_Expectancy : num 82.7 83.3 84 81.9 81.4 ...
## $ Expected_Yrs_Schooling_2023 : num 18.9 18.8 16.7 18.7 17.3 ...
## $ Mean_Schooling_Yrs_2023 : num 13.9 13.1 13.9 13 14.3 ...
## $ Gross_GNI_Per_Capita : num 69117 112710 81949 76008 64053 ...
## $ GNI_Per_Capita_Minus_HDI_Rank: int 12 0 5 4 13 10 14 4 6 9 ...
## $ HDI_Rank_2022 : int 3 1 2 4 6 4 8 9 7 13 ...
# Write the cleaned table to a CSV file without a row-name column, then read
# it back in.
write.csv(x = HDIClean, file = "HDIClean.csv", row.names = FALSE)
HDICleanCSV <- read.csv(file = "HDIClean.csv")
# Inspect the structure to confirm the CSV is read in correctly.
str(HDICleanCSV)
## 'data.frame': 193 obs. of 9 variables:
## $ HDI_Rank : int 1 2 2 4 5 5 7 8 8 10 ...
## $ Country : chr "Iceland" "Norway" "Switzerland" "Denmark" ...
## $ HDI_Value_2023 : num 0.972 0.97 0.97 0.962 0.959 0.959 0.958 0.955 0.955 0.951 ...
## $ Life_Expectancy : num 82.7 83.3 84 81.9 81.4 ...
## $ Expected_Yrs_Schooling_2023 : num 18.9 18.8 16.7 18.7 17.3 ...
## $ Mean_Schooling_Yrs_2023 : num 13.9 13.1 13.9 13 14.3 ...
## $ Gross_GNI_Per_Capita : num 69117 112710 81949 76008 64053 ...
## $ GNI_Per_Capita_Minus_HDI_Rank: int 12 0 5 4 13 10 14 4 6 9 ...
## $ HDI_Rank_2022 : int 3 1 2 4 6 4 8 9 7 13 ...