For this first part we will mostly be doing an example workflow
# first load libraries
library(ggplot2)
library(MASS)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
theme_set(theme_bw(base_size = 16))
bring in our data vector
# from what I gather we are making an object z which is composed of 3000 random numbers
z <- rnorm(n=3000,mean=0.2)
z <- data.frame(1:3000,z)
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
str(z)
## 'data.frame': 1731 obs. of 2 variables:
## $ ID : int 1 2 3 4 5 7 8 9 10 11 ...
## $ myVar: num 0.4181 0.0216 0.2764 1.1487 0.1693 ...
summary(z$myVar)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000813 0.353326 0.738302 0.863336 1.233888 3.621359
for distributions the best thing we can do is visualize it. we’ll do that with a general histogram but lets make it cute
# the original color was cornsilk we will not be doing that because it is ugly.
p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
geom_histogram(color="black",fill="orchid",size=0.2)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p1)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# now that's a hot graph!
# now we can add a density curve so we can see the distribution
p1 <- p1 + geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
so we have the shape of our distribution now we can move on to getting the maximum likelihood parameters of the normal
normPars <- fitdistr(z$myVar,"normal")
print(normPars)
## mean sd
## 0.86333572 0.63483577
## (0.01525854) (0.01078941)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 0.863 0.635
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.0153 0.0108
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.000233 0 0 0.000116
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 1731
## $ loglik : num -1670
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
## mean
## 0.8633357
Now we can overlay what is a normal probability density with our data
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(z$myVar),len=length(z$myVar))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="Dark Cyan", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
looks like we got a bit of a skew!
We’ll run through each distribution to see how it fits
Exponential
expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="gold", n = length(z$myVar), args = list(rate=rateML))
p1 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Uniform
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
uniform definitely ain’t it
Gamma
gammaPars <- fitdistr(z$myVar,"gamma")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="Tomato", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Beta
pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
geom_histogram(color="grey60",fill="dodgerblue",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
The dataset that I will be using will be the physical characteristics of the first 8 generations of pokemon. We have height and weight data for 905 pokemon but we’ll just use the height
#we'll load up the dataset
z <- read.table(file = "../Datasets/archive/pokemon_data_pokeapi.csv",header=TRUE,sep=",")
str(z)
## 'data.frame': 905 obs. of 10 variables:
## $ Name : chr "Bulbasaur" "Ivysaur" "Venusaur" "Charmander" ...
## $ Pokedex.Number : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Type1 : chr "Grass" "Grass" "Grass" "Fire" ...
## $ Type2 : chr "Poison" "Poison" "Poison" "" ...
## $ Classification : chr "Seed Pokémon" "Seed Pokémon" "Seed Pokémon" "Lizard Pokémon" ...
## $ Height..m. : num 0.7 1 2 0.6 1.1 1.7 0.5 1 1.6 0.3 ...
## $ Weight..kg. : num 6.9 13 100 8.5 19 90.5 9 22.5 85.5 2.9 ...
## $ Abilities : chr "Overgrow, Chlorophyll" "Overgrow, Chlorophyll" "Overgrow, Chlorophyll" "Blaze, Solar-power" ...
## $ Generation : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Legendary.Status: chr "No" "No" "No" "No" ...
summary(z)
## Name Pokedex.Number Type1 Type2
## Length:905 Min. : 1 Length:905 Length:905
## Class :character 1st Qu.:227 Class :character Class :character
## Mode :character Median :453 Mode :character Mode :character
## Mean :453
## 3rd Qu.:679
## Max. :905
## Classification Height..m. Weight..kg. Abilities
## Length:905 Min. : 0.100 Min. : 0.10 Length:905
## Class :character 1st Qu.: 0.500 1st Qu.: 8.50 Class :character
## Mode :character Median : 1.000 Median : 28.00 Mode :character
## Mean : 1.193 Mean : 64.29
## 3rd Qu.: 1.500 3rd Qu.: 65.50
## Max. :20.000 Max. :999.90
## Generation Legendary.Status
## Min. :1.000 Length:905
## 1st Qu.:2.000 Class :character
## Median :4.000 Mode :character
## Mean :4.177
## 3rd Qu.:6.000
## Max. :8.000
# as cool as it is we really only need the numerical columns so we'll select those
z <- z %>% select(Name,Pokedex.Number,Height..m.,Weight..kg.)
#we'll change the names so that the height is the myvar so we can run it
colnames(z) <- c("Name","dex.entry","myVar","weight")
summary(z$myVar)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.100 0.500 1.000 1.193 1.500 20.000
From here we’ll run through the code the same way as before but with real data….well real data about fictional creatures.
p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
geom_histogram(color="black",fill="orchid",size=0.2)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p1 <- p1 + geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
oooooo dang that is one hell of a skew. so keep in mind that myVar is the height in meters so this is skewed hard to the left and cannot be negative
normPars <- fitdistr(z$myVar,"normal")
print(normPars)
## mean sd
## 1.19270718 1.23207226
## (0.04095547) (0.02895989)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 1.19 1.23
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.041 0.029
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.001677 0 0 0.000839
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 905
## $ loglik : num -1473
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
## mean
## 1.192707
lets proceed going through each of the distributions
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(z$myVar),len=length(z$myVar))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="Dark Cyan", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="gold", n = length(z$myVar), args = list(rate=rateML))
p1 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
### Uniform
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
gammaPars <- fitdistr(z$myVar,"gamma")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="Tomato", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Beta
pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
geom_histogram(color="grey60",fill="dodgerblue",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
As a general overview of the data the shape is that of a left skew with a long right tail meaning that the data are dense on the left with low numbers of higher values. Contextually this means that the majority of pokemon are in the same height range (0-3) meters while >5 meters is represented by individuals or small amounts of pokemon. This is probably pretty close to real world depictions of animals within the same species (I suppose technically pokemon are all the same species since they can interbreed somehow….) where the majority is centered around a mean size with large members being sparsely represented. Looking at the different distributions the gamma and beta distributions seem to fit the best. I would go as far to say that the gamma distribution fits the most accurately although the beta is very close. While exponential captures the tail very well it greatly overestimates the low end and the normal distribution underestimates the mean values. As we would expect, the uniform is flatout wrong but it would be inappropriate to use in this context because the probability of a pokemon height is not equal from pokemon to pokemon.
Lastly we’ll create a mock dataset with the same mean and length and see what that looks like
normPars <- fitdistr(z$myVar,"gamma")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
print(normPars)
## shape rate
## 1.94521299 1.63092264
## (0.08477272) (0.08101067)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 1.95 1.63
## ..- attr(*, "names")= chr [1:2] "shape" "rate"
## $ sd : Named num [1:2] 0.0848 0.081
## ..- attr(*, "names")= chr [1:2] "shape" "rate"
## $ vcov : num [1:2, 1:2] 0.00719 0.00603 0.00603 0.00656
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "shape" "rate"
## .. ..$ : chr [1:2] "shape" "rate"
## $ loglik : num -967
## $ n : int 905
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
## <NA>
## NA
y <- rnorm(n=905,mean=1.193)
y <- data.frame(1:905,y)
names(y) <- list("ID","myVar")
y <- y[y$myVar>0,]
str(y)
## 'data.frame': 807 obs. of 2 variables:
## $ ID : int 2 3 4 5 6 7 8 10 11 12 ...
## $ myVar: num 1.178 0.912 0.603 1.655 0.856 ...
summary(y$myVar)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000747 0.775480 1.360841 1.421387 1.970951 4.540795
p2 <- ggplot(data=y, aes(x=myVar, y=..density..)) +
geom_histogram(color="black",fill="orchid",size=0.2)
print(p2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p2 <- p2 + geom_density(linetype="dotted",size=0.75)
print(p2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
completely different! This one actually looks like a normal distribution though it does have a small amount of tail on the right, it doesn’t capture the reality that the tail extends to a large number