Running the example data

For this first part we will mostly be doing an example workflow

# first load libraries

library(ggplot2)
library(MASS)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
theme_set(theme_bw(base_size = 16))

bring in our data vector

# from what I gather we are making an object z which is composed of 3000 random numbers

z <- rnorm(n=3000,mean=0.2)
z <- data.frame(1:3000,z)
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
str(z)
## 'data.frame':    1731 obs. of  2 variables:
##  $ ID   : int  1 2 3 4 5 7 8 9 10 11 ...
##  $ myVar: num  0.4181 0.0216 0.2764 1.1487 0.1693 ...
summary(z$myVar)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000813 0.353326 0.738302 0.863336 1.233888 3.621359

for distributions the best thing we can do is visualize it. we’ll do that with a general histogram but lets make it cute

# the original color was cornsilk we will not be doing that because it is ugly. 

p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
  geom_histogram(color="black",fill="orchid",size=0.2) 
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p1)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# now that's a hot graph!

# now we can add a density curve so we can see the distribution

p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

so we have the shape of our distribution now we can move on to getting the maximum likelihood parameters of the normal

normPars <- fitdistr(z$myVar,"normal")
print(normPars)
##       mean          sd    
##   0.86333572   0.63483577 
##  (0.01525854) (0.01078941)
str(normPars)
## List of 5
##  $ estimate: Named num [1:2] 0.863 0.635
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.0153 0.0108
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.000233 0 0 0.000116
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 1731
##  $ loglik  : num -1670
##  - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
##      mean 
## 0.8633357

Now we can overlay what is a normal probability density with our data

meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(z$myVar),len=length(z$myVar))

 stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="Dark Cyan", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
 p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

looks like we got a bit of a skew!

We’ll run through each distribution to see how it fits

Exponential

expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="gold", n = length(z$myVar), args = list(rate=rateML))
 p1 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Uniform

stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
 p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

uniform definitely ain’t it

Gamma

gammaPars <- fitdistr(z$myVar,"gamma")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="Tomato", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
 p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Beta

pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="dodgerblue",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Using real data!

The dataset that I will be using will be the physical characteristics of the first 8 generations of pokemon. We have height and weight data for 905 pokemon but we’ll just use the height

#we'll load up the dataset

z <- read.table(file = "../Datasets/archive/pokemon_data_pokeapi.csv",header=TRUE,sep=",")
str(z)
## 'data.frame':    905 obs. of  10 variables:
##  $ Name            : chr  "Bulbasaur" "Ivysaur" "Venusaur" "Charmander" ...
##  $ Pokedex.Number  : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Type1           : chr  "Grass" "Grass" "Grass" "Fire" ...
##  $ Type2           : chr  "Poison" "Poison" "Poison" "" ...
##  $ Classification  : chr  "Seed Pokémon" "Seed Pokémon" "Seed Pokémon" "Lizard Pokémon" ...
##  $ Height..m.      : num  0.7 1 2 0.6 1.1 1.7 0.5 1 1.6 0.3 ...
##  $ Weight..kg.     : num  6.9 13 100 8.5 19 90.5 9 22.5 85.5 2.9 ...
##  $ Abilities       : chr  "Overgrow, Chlorophyll" "Overgrow, Chlorophyll" "Overgrow, Chlorophyll" "Blaze, Solar-power" ...
##  $ Generation      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Legendary.Status: chr  "No" "No" "No" "No" ...
summary(z)
##      Name           Pokedex.Number    Type1              Type2          
##  Length:905         Min.   :  1    Length:905         Length:905        
##  Class :character   1st Qu.:227    Class :character   Class :character  
##  Mode  :character   Median :453    Mode  :character   Mode  :character  
##                     Mean   :453                                         
##                     3rd Qu.:679                                         
##                     Max.   :905                                         
##  Classification       Height..m.      Weight..kg.      Abilities        
##  Length:905         Min.   : 0.100   Min.   :  0.10   Length:905        
##  Class :character   1st Qu.: 0.500   1st Qu.:  8.50   Class :character  
##  Mode  :character   Median : 1.000   Median : 28.00   Mode  :character  
##                     Mean   : 1.193   Mean   : 64.29                     
##                     3rd Qu.: 1.500   3rd Qu.: 65.50                     
##                     Max.   :20.000   Max.   :999.90                     
##    Generation    Legendary.Status  
##  Min.   :1.000   Length:905        
##  1st Qu.:2.000   Class :character  
##  Median :4.000   Mode  :character  
##  Mean   :4.177                     
##  3rd Qu.:6.000                     
##  Max.   :8.000
# as cool as it is we really only need the numerical columns so we'll select those 

z <- z %>% select(Name,Pokedex.Number,Height..m.,Weight..kg.)

#we'll change the names so that the height is the myvar so we can run it

colnames(z) <- c("Name","dex.entry","myVar","weight")


summary(z$myVar)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.100   0.500   1.000   1.193   1.500  20.000

From here we’ll run through the code the same way as before but with real data….well real data about fictional creatures.

p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
  geom_histogram(color="black",fill="orchid",size=0.2) 
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

oooooo dang that is one hell of a skew. so keep in mind that myVar is the height in meters so this is skewed hard to the left and cannot be negative

normPars <- fitdistr(z$myVar,"normal")
print(normPars)
##       mean          sd    
##   1.19270718   1.23207226 
##  (0.04095547) (0.02895989)
str(normPars)
## List of 5
##  $ estimate: Named num [1:2] 1.19 1.23
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.041 0.029
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.001677 0 0 0.000839
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 905
##  $ loglik  : num -1473
##  - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
##     mean 
## 1.192707

lets proceed going through each of the distributions

Normal

meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(z$myVar),len=length(z$myVar))

 stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="Dark Cyan", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
 p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Exponential

expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="gold", n = length(z$myVar), args = list(rate=rateML))
 p1 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

### Uniform

stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
 p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Gamma

gammaPars <- fitdistr(z$myVar,"gamma")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="Tomato", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
 p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Beta

pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="dodgerblue",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Analysis and Discussion

As a general overview of the data the shape is that of a left skew with a long right tail meaning that the data are dense on the left with low numbers of higher values. Contextually this means that the majority of pokemon are in the same height range (0-3) meters while >5 meters is represented by individuals or small amounts of pokemon. This is probably pretty close to real world depictions of animals within the same species (I suppose technically pokemon are all the same species since they can interbreed somehow….) where the majority is centered around a mean size with large members being sparsely represented. Looking at the different distributions the gamma and beta distributions seem to fit the best. I would go as far to say that the gamma distribution fits the most accurately although the beta is very close. While exponential captures the tail very well it greatly overestimates the low end and the normal distribution underestimates the mean values. As we would expect, the uniform is flatout wrong but it would be inappropriate to use in this context because the probability of a pokemon height is not equal from pokemon to pokemon.

Lastly we’ll create a mock dataset with the same mean and length and see what that looks like

normPars <- fitdistr(z$myVar,"gamma")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
print(normPars)
##      shape         rate   
##   1.94521299   1.63092264 
##  (0.08477272) (0.08101067)
str(normPars)
## List of 5
##  $ estimate: Named num [1:2] 1.95 1.63
##   ..- attr(*, "names")= chr [1:2] "shape" "rate"
##  $ sd      : Named num [1:2] 0.0848 0.081
##   ..- attr(*, "names")= chr [1:2] "shape" "rate"
##  $ vcov    : num [1:2, 1:2] 0.00719 0.00603 0.00603 0.00656
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "shape" "rate"
##   .. ..$ : chr [1:2] "shape" "rate"
##  $ loglik  : num -967
##  $ n       : int 905
##  - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
## <NA> 
##   NA
y <- rnorm(n=905,mean=1.193)
y <- data.frame(1:905,y)
names(y) <- list("ID","myVar")
y <- y[y$myVar>0,]
str(y)
## 'data.frame':    807 obs. of  2 variables:
##  $ ID   : int  2 3 4 5 6 7 8 10 11 12 ...
##  $ myVar: num  1.178 0.912 0.603 1.655 0.856 ...
summary(y$myVar)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000747 0.775480 1.360841 1.421387 1.970951 4.540795
p2 <- ggplot(data=y, aes(x=myVar, y=..density..)) +
  geom_histogram(color="black",fill="orchid",size=0.2) 
print(p2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

p2 <-  p2 +  geom_density(linetype="dotted",size=0.75)
print(p2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

completely different! This one actually looks like a normal distribution though it does have a small amount of tail on the right, it doesn’t capture the reality that the tail extends to a large number