R Notebook

# Location of the HDI statistical-annex workbook (HDR25) on GitHub.
hdi_url <- "https://github.com/Fundamentals-Sarah/HW5Repo2/raw/refs/heads/main/HDR25_Statistical_Annex_HDI_Table.xlsx"

# Read the workbook with rio, called through its namespace so the package is
# not attached. The URL lives in its own variable: previously HDIOriginal held
# the URL and was then overwritten by the imported table, so re-running the
# import line on its own would have passed a data frame as `file`.
HDIOriginal <- rio::import(file = hdi_url)

## New names:
## • `` -> `...1`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
## • `` -> `...15`

# Show the raw import to decide which rows and columns need cleaning.
HDIOriginal

# Restrict the data frame to rows 4 through 203 of the raw import.
HDICleaner <- HDIOriginal[4:203, ]

# Peek at the first six rows to confirm the row subset.
head(HDICleaner, n = 6L)

# Keep columns 1 and 2 plus every second column from 3 to 15; according to the
# original note, the skipped columns carry no information.
HDICleaner <- HDICleaner[, c(1, 2, seq(3, 15, by = 2))]

# Peek again to confirm the column subset.
head(HDICleaner, n = 6L)

# Give the nine retained columns descriptive names in one vectorised
# assignment (positions 1 to 9, in order), then inspect the structure to
# confirm the names were applied. The header text that the spreadsheet spread
# over the first data rows is still present at this point.
names(HDICleaner)[1:9] <- c(
  "HDI_Rank",
  "Country",
  "HDI_Value_2023",
  "Life_Expectancy",
  "Expected_Yrs_Schooling_2023",
  "Mean_Schooling_Yrs_2023",
  "Gross_GNI_Per_Capita",
  "GNI_Per_Capita_Minus_HDI_Rank",
  "HDI_Rank_2022"
)
str(HDICleaner)

## 'data.frame':    200 obs. of  9 variables:
##  $ HDI_Rank                     : chr  NA "HDI rank" NA NA ...
##  $ Country                      : chr  NA "Country" NA "Very high human development" ...
##  $ HDI_Value_2023               : chr  "Human Development Index (HDI)" "Value" "2023" NA ...
##  $ Life_Expectancy              : chr  "Life expectancy at birth" "(years)" "2023" NA ...
##  $ Expected_Yrs_Schooling_2023  : chr  "Expected years of schooling" "(years)" "2023" NA ...
##  $ Mean_Schooling_Yrs_2023      : chr  "Mean years of schooling" "(years)" "2023" NA ...
##  $ Gross_GNI_Per_Capita         : chr  "Gross national income (GNI) per capita" "(2021 PPP $)" "2023" NA ...
##  $ GNI_Per_Capita_Minus_HDI_Rank: chr  "GNI per capita rank minus HDI rank" NA "2023" NA ...
##  $ HDI_Rank_2022                : chr  "HDI rank" NA "2022" NA ...

# Remove the rows that are not country records, by position in HDICleaner:
#   - rows 1 to 4: the leftover header text (rows 1-3) and the
#     "Very high human development" label (row 4);
#   - rows 79, 130 and 174: per the original note, the remaining HDI category
#     label rows, which are not needed.
header_rows <- 1:4
category_rows <- c(79, 130, 174)
HDIClean <- HDICleaner[-c(header_rows, category_rows), ]

# Inspect the structure of the trimmed data frame.
str(HDIClean)

## 'data.frame':    193 obs. of  9 variables:
##  $ HDI_Rank                     : chr  "1" "2" "2" "4" ...
##  $ Country                      : chr  "Iceland" "Norway" "Switzerland" "Denmark" ...
##  $ HDI_Value_2023               : chr  "0.97199999999999998" "0.97" "0.97" "0.96199999999999997" ...
##  $ Life_Expectancy              : chr  "82.691000000000003" "83.308000000000007" "83.953999999999994" "81.933000000000007" ...
##  $ Expected_Yrs_Schooling_2023  : chr  "18.850589750000001" "18.792850489999999" "16.667530060000001" "18.704010010000001" ...
##  $ Mean_Schooling_Yrs_2023      : chr  "13.908926279999999" "13.117962179999999" "13.94912109" "13.027320599999999" ...
##  $ Gross_GNI_Per_Capita         : chr  "69116.937359999996" "112710.0211" "81948.901769999997" "76007.856690000001" ...
##  $ GNI_Per_Capita_Minus_HDI_Rank: chr  "12" "0" "5" "4" ...
##  $ HDI_Rank_2022                : chr  "3" "1" "2" "4" ...

# Every measurement column was read as character, so means cannot be computed
# yet. Group the columns by the type they should end up with.
integer_cols <- c("HDI_Rank", "GNI_Per_Capita_Minus_HDI_Rank", "HDI_Rank_2022")
numeric_cols <- c(
  "HDI_Value_2023",
  "Life_Expectancy",
  "Expected_Yrs_Schooling_2023",
  "Mean_Schooling_Yrs_2023",
  "Gross_GNI_Per_Capita"
)

# Strip any leading/trailing whitespace from all of them before converting.
value_cols <- c(integer_cols, numeric_cols)
HDIClean[value_cols] <- lapply(HDIClean[value_cols], trimws)

# Convert rank-like columns to integer and the remaining measures to double.
# NOTE(review): the knitted output reports "NAs introduced by coercion" for
# this step, so at least one entry is not a parseable number and becomes NA;
# worth confirming which cells are affected.
HDIClean[integer_cols] <- lapply(HDIClean[integer_cols], as.integer)
HDIClean[numeric_cols] <- lapply(HDIClean[numeric_cols], as.numeric)

## Warning: NAs introduced by coercion

# Quartiles, extremes and mean of life expectancy; the mean across countries
# is 73.11.
summary(HDIClean[["Life_Expectancy"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   54.46   67.39   73.49   73.11   78.34   85.71

# Quartiles, extremes and mean of expected years of schooling; the mean across
# countries is 13.585.
summary(HDIClean[["Expected_Yrs_Schooling_2023"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   5.635  11.505  13.336  13.585  15.888  20.846

# Quartiles, extremes and mean of mean years of schooling; the mean across
# countries is 9.173.
summary(HDIClean[["Mean_Schooling_Yrs_2023"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.412   6.780   9.933   9.173  11.642  14.296

# Quartiles, extremes and mean of GNI per capita (the source header labels it
# "2021 PPP $"); the mean across countries is about 24,620.7.
summary(HDIClean[["Gross_GNI_Per_Capita"]])

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    688.3   5746.6  15866.5  24620.7  36793.0 166811.7

# Persist the cleaned data frame as an RDS file in the working directory.
saveRDS(HDIClean, file = "HDIClean.RDS")

# Read the RDS file back under a new name and inspect its structure to confirm
# it was written and restored correctly.
HDICleanRDS <- readRDS(file = "HDIClean.RDS")
str(HDICleanRDS)

## 'data.frame':    193 obs. of  9 variables:
##  $ HDI_Rank                     : int  1 2 2 4 5 5 7 8 8 10 ...
##  $ Country                      : chr  "Iceland" "Norway" "Switzerland" "Denmark" ...
##  $ HDI_Value_2023               : num  0.972 0.97 0.97 0.962 0.959 0.959 0.958 0.955 0.955 0.951 ...
##  $ Life_Expectancy              : num  82.7 83.3 84 81.9 81.4 ...
##  $ Expected_Yrs_Schooling_2023  : num  18.9 18.8 16.7 18.7 17.3 ...
##  $ Mean_Schooling_Yrs_2023      : num  13.9 13.1 13.9 13 14.3 ...
##  $ Gross_GNI_Per_Capita         : num  69117 112710 81949 76008 64053 ...
##  $ GNI_Per_Capita_Minus_HDI_Rank: int  12 0 5 4 13 10 14 4 6 9 ...
##  $ HDI_Rank_2022                : int  3 1 2 4 6 4 8 9 7 13 ...

# Export the cleaned data frame to CSV, leaving out the row-name column.
write.csv(HDIClean, file = "HDIClean.csv", row.names = FALSE)

# Re-import the CSV under a new name and inspect its structure to confirm the
# file reads back correctly.
HDICleanCSV <- read.csv(file = "HDIClean.csv")
str(HDICleanCSV)

## 'data.frame':    193 obs. of  9 variables:
##  $ HDI_Rank                     : int  1 2 2 4 5 5 7 8 8 10 ...
##  $ Country                      : chr  "Iceland" "Norway" "Switzerland" "Denmark" ...
##  $ HDI_Value_2023               : num  0.972 0.97 0.97 0.962 0.959 0.959 0.958 0.955 0.955 0.951 ...
##  $ Life_Expectancy              : num  82.7 83.3 84 81.9 81.4 ...
##  $ Expected_Yrs_Schooling_2023  : num  18.9 18.8 16.7 18.7 17.3 ...
##  $ Mean_Schooling_Yrs_2023      : num  13.9 13.1 13.9 13 14.3 ...
##  $ Gross_GNI_Per_Capita         : num  69117 112710 81949 76008 64053 ...
##  $ GNI_Per_Capita_Minus_HDI_Rank: int  12 0 5 4 13 10 14 4 6 9 ...
##  $ HDI_Rank_2022                : int  3 1 2 4 6 4 8 9 7 13 ...