We are going to be working with the (classic) iris dataset and using dplyr TO BEND THE DATASET TO OUR WILL AND REFORGE IT IN OUR IMAGE!
## first things first load up the libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the built-in iris data and work on a copy with a shorter name.
data(iris)
ir <- iris
dim(ir)
## [1] 150 5
# 150 observations of 5 variables.
# Keep only virginica/versicolor flowers with Sepal.Length > 6 and
# Sepal.Width > 2.5; comma-separated filter() conditions are combined with AND.
iris1 <- filter(
  ir,
  Species %in% c("virginica", "versicolor"),
  Sepal.Length > 6,
  Sepal.Width > 2.5
)
# Check the dimensions of the filtered data.
dim(iris1)
## [1] 56 5
# 56 observations remain, still with all 5 variables.
# Keep only the species label and the two sepal measurements.
iris2 <- iris1 %>%
  select(Species, Sepal.Length, Sepal.Width)
# Check the dimensions.
dim(iris2)
## [1] 56 3
# Two columns were dropped, so there are still 56 observations but only
# 3 variables.
# Sort the rows from the longest sepal to the shortest.
iris3 <- iris2 %>%
  arrange(desc(Sepal.Length))
# Look at the first six rows.
head(iris3)
## Species Sepal.Length Sepal.Width
## 1 virginica 7.9 3.8
## 2 virginica 7.7 3.8
## 3 virginica 7.7 2.6
## 4 virginica 7.7 2.8
## 5 virginica 7.7 3.0
## 6 virginica 7.6 3.0
# Add a derived column: sepal area = sepal length * sepal width.
iris4 <- mutate(iris3, Sepal.Area = Sepal.Length * Sepal.Width)
dim(iris4)
## [1] 56 4
# mutate() adds a column without changing the number of rows, so we now have
# 56 observations and 4 variables (the new one being Sepal.Area).
# Summarise the whole filtered dataset into a single row: mean, standard
# deviation and variance of sepal length and width, plus the observation count.
iris5 <- summarise(
  iris4,
  avg.Sepal.Length = mean(Sepal.Length),
  avg.Sepal.Width = mean(Sepal.Width),
  sd.Sepal.Length = sd(Sepal.Length),
  sd.Sepal.Width = sd(Sepal.Width),
  var.Sepal.Length = var(Sepal.Length),
  var.Sepal.Width = var(Sepal.Width),
  obs.count = n()
)
print(iris5)
## avg.Sepal.Length avg.Sepal.Width sd.Sepal.Length sd.Sepal.Width
## 1 6.698214 3.041071 0.4863561 0.2535399
## var.Sepal.Length var.Sepal.Width obs.count
## 1 0.2365422 0.06428247 56
# Same statistics as the overall summary, but computed separately for each
# species: mean, standard deviation, variance and observation count.
iris6 <- iris4 %>%
  group_by(Species) %>%
  summarise(
    avg.Sepal.Length = mean(Sepal.Length),
    avg.Sepal.Width = mean(Sepal.Width),
    sd.Sepal.Length = sd(Sepal.Length),
    sd.Sepal.Width = sd(Sepal.Width),
    var.Sepal.Length = var(Sepal.Length),
    var.Sepal.Width = var(Sepal.Width),
    obs.count = n()
  )
# Interesting: the filtered dataset is pretty virginica-heavy, which must have
# something to do with our selection criteria.
# The whole workflow again as one pipeline, starting from the raw copy:
# filter -> select -> arrange -> mutate -> group -> summarise.
# Written with one verb per line so each step can be read on its own.
irisFinal <- ir %>%
  filter(
    Species %in% c("virginica", "versicolor") &
      Sepal.Length > 6 &
      Sepal.Width > 2.5
  ) %>%
  select(Species, Sepal.Length, Sepal.Width) %>%
  arrange(desc(Sepal.Length)) %>%
  mutate(Sepal.Area = (Sepal.Length * Sepal.Width)) %>%
  group_by(Species) %>%
  summarise(
    avg.Sepal.Length = mean(Sepal.Length),
    avg.Sepal.Width = mean(Sepal.Width),
    sd.Sepal.Length = sd(Sepal.Length),
    sd.Sepal.Width = sd(Sepal.Width),
    var.Sepal.Length = var(Sepal.Length),
    var.Sepal.Width = var(Sepal.Width),
    obs.count = n()
  )
# Reshape the full iris copy to long format: the four measurement columns
# (Sepal.Length through Petal.Width) become name/value pairs in the
# "Measure" and "Value" columns, one row per flower per measurement.
ir.long <- pivot_longer(
  ir,
  cols = Sepal.Length:Petal.Width,
  names_to = "Measure",
  values_to = "Value"
)
What’s that you say? You want to see a graph of the data? Well, since you asked so nicely!
# Use the black-and-white theme for every plot drawn from here on.
theme_set(theme_bw())
# Dodged bar chart of the mean of each measurement per species.
# ir.long holds one row per flower per measurement, so drawing the raw rows
# with stat = "identity" stacks many bars at the same dodged position; they
# overplot and only the tallest one (the maximum) is visible. Summarising to
# the mean first gives exactly one bar per species/measure combination.
ggplot(data = ir.long, aes(x = Species, y = Value, fill = Measure)) +
  stat_summary(fun = mean, geom = "bar", position = "dodge") +
  scale_fill_manual(
    values = c("#219ebc", "#023047", "#ffb703", "#fb8500")
  ) +
  # The y axis now shows a per-group mean rather than raw values.
  labs(y = "Mean value")