10/02, 2022
First steps
Hair | Brown | Blue | Hazel | Green |
---|---|---|---|---|
Black | 32 | 11 | 10 | 3 |
Brown | 53 | 50 | 25 | 15 |
Red | 10 | 10 | 7 | 7 |
Blond | 3 | 30 | 5 | 8 |
Hair | Eye | Sex | Freq |
---|---|---|---|
Black | Brown | Male | 32 |
Brown | Brown | Male | 53 |
Red | Brown | Male | 10 |
Blond | Brown | Male | 3 |
Black | Blue | Male | 11 |
Brown | Blue | Male | 50 |
Red | Blue | Male | 10 |
Blond | Blue | Male | 30 |
Black | Hazel | Male | 10 |
Brown | Hazel | Male | 25 |
Red | Hazel | Male | 7 |
Blond | Hazel | Male | 5 |
Black | Green | Male | 3 |
Brown | Green | Male | 15 |
Red | Green | Male | 7 |
Blond | Green | Male | 8 |
Black | Brown | Female | 36 |
Brown | Brown | Female | 66 |
Red | Brown | Female | 16 |
Blond | Brown | Female | 4 |
Black | Blue | Female | 9 |
Brown | Blue | Female | 34 |
Red | Blue | Female | 7 |
Blond | Blue | Female | 64 |
Black | Hazel | Female | 5 |
Brown | Hazel | Female | 29 |
Red | Hazel | Female | 7 |
Blond | Hazel | Female | 5 |
Black | Green | Female | 2 |
Brown | Green | Female | 14 |
Red | Green | Female | 7 |
Blond | Green | Female | 8 |
A package with a few very powerful functions to wrangle data
Part of tidyverse
group_by (group data)
summarize
filter (Find rows with certain conditions)
select together with starts_with, ends_with or contains
mutate (Generates new variables)
%>% pipeline
arrange (sorts rows)
# Attach the tidyverse so that summarize() is available.
library(tidyverse)

# Summarize the whole iris data set in a single row: the mean and the
# standard deviation of Petal.Length.
Summary.Petal <- summarize(
  iris,
  Mean.Petal.Length = mean(Petal.Length),
  SD.Petal.Length = sd(Petal.Length)
)
Mean.Petal.Length | SD.Petal.Length |
---|---|
3.758 | 1.765298 |
# Group iris by Species first, so the summary below is computed per species.
Summary.Petal <- group_by(iris, Species)

# One row per species: mean and standard deviation of Petal.Length.
Summary.Petal <- summarize(
  Summary.Petal,
  Mean.Petal.Length = mean(Petal.Length),
  SD.Petal.Length = sd(Petal.Length)
)
Species | Mean.Petal.Length | SD.Petal.Length |
---|---|---|
setosa | 1.462 | 0.1736640 |
versicolor | 4.260 | 0.4699110 |
virginica | 5.552 | 0.5518947 |
# Load the mtcars data set into the workspace.
data("mtcars")

# Group the cars by two variables at once: am and cyl.
Mtcars2 <- group_by(mtcars, am, cyl)

# Mean and standard deviation of mpg for every am/cyl combination.
Consumo <- summarize(
  Mtcars2,
  Average_MPG = mean(mpg),
  desv = sd(mpg)
)
am | cyl | Average_MPG | desv |
---|---|---|---|
0 | 4 | 22.90000 | 1.4525839 |
0 | 6 | 19.12500 | 1.6317169 |
0 | 8 | 15.05000 | 2.7743959 |
1 | 4 | 28.07500 | 4.4838599 |
1 | 6 | 20.56667 | 0.7505553 |
1 | 8 | 15.40000 | 0.5656854 |
# Add a new column to iris: the ratio of petal length to sepal length.
DF <- mutate(
  iris,
  Petal.Sepal.Ratio = Petal.Length / Sepal.Length
)
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | Petal.Sepal.Ratio |
---|---|---|---|---|---|
5.8 | 4.0 | 1.2 | 0.2 | setosa | 0.21 |
4.7 | 3.2 | 1.6 | 0.2 | setosa | 0.34 |
5.1 | 3.8 | 1.9 | 0.4 | setosa | 0.37 |
5.2 | 2.7 | 3.9 | 1.4 | versicolor | 0.75 |
6.4 | 2.9 | 4.3 | 1.3 | versicolor | 0.67 |
5.5 | 2.5 | 4.0 | 1.3 | versicolor | 0.73 |
6.5 | 3.0 | 5.8 | 2.2 | virginica | 0.89 |
6.0 | 2.2 | 5.0 | 1.5 | virginica | 0.83 |
6.1 | 2.6 | 5.6 | 1.4 | virginica | 0.92 |
5.9 | 3.0 | 5.1 | 1.8 | virginica | 0.86 |
# Nested-call version: evaluated from the inside out
# (log, then sqrt, then mean, then round to 2 decimals).
x <- c(1, 4, 6, 8)
y <- round(mean(sqrt(log(x))), 2)
# Pipe version of the same computation: each step feeds the next one,
# read from top to bottom (log, sqrt, mean, round to 2 decimals).
x <- c(1, 4, 6, 8)
y <- x %>%
  log() %>%
  sqrt() %>%
  mean() %>%
  round(2)
## [1] 0.99
# Step-by-step version using intermediate objects.

# 1. Add the petal-to-sepal length ratio as a new column.
DF <- mutate(iris, Petal.Sepal.Ratio = Petal.Length / Sepal.Length)

# 2. Group the augmented data by species.
BySpecies <- group_by(DF, Species)

# 3. Mean and standard deviation of the ratio for each species.
Summary.Byspecies <- summarize(
  BySpecies,
  MEAN = mean(Petal.Sepal.Ratio),
  SD = sd(Petal.Sepal.Ratio)
)
Species | MEAN | SD |
---|---|---|
setosa | 0.2927557 | 0.0347958 |
versicolor | 0.7177285 | 0.0536255 |
virginica | 0.8437495 | 0.0438064 |
# Same result as a single nested call: mutate() runs first (innermost),
# then group_by(), then summarize() (outermost).
Summary.Byspecies <- summarize(
  group_by(
    mutate(iris, Petal.Sepal.Ratio = Petal.Length / Sepal.Length),
    Species
  ),
  MEAN = mean(Petal.Sepal.Ratio),
  SD = sd(Petal.Sepal.Ratio)
)
Species | MEAN | SD |
---|---|---|
setosa | 0.2927557 | 0.0347958 |
versicolor | 0.7177285 | 0.0536255 |
virginica | 0.8437495 | 0.0438064 |
# Attach the tidyverse so that the pipe and the dplyr verbs are available.
library(tidyverse)

# For each species, apply both mean and sd to every other column.
# The names in the list become suffixes of the result columns
# (e.g. Sepal.Length_Mean, Sepal.Length_SD).
MEAN <- iris %>%
  group_by(Species) %>%
  summarize_all(.funs = list(Mean = mean, SD = sd))
Species | Sepal.Length_Mean | Sepal.Width_Mean | Petal.Length_Mean | Petal.Width_Mean | Sepal.Length_SD | Sepal.Width_SD | Petal.Length_SD | Petal.Width_SD |
---|---|---|---|---|---|---|---|---|
setosa | 5.006 | 3.428 | 1.462 | 0.246 | 0.3524897 | 0.3790644 | 0.1736640 | 0.1053856 |
versicolor | 5.936 | 2.770 | 4.260 | 1.326 | 0.5161711 | 0.3137983 | 0.4699110 | 0.1977527 |
virginica | 6.588 | 2.974 | 5.552 | 2.026 | 0.6358796 | 0.3224966 | 0.5518947 | 0.2746501 |
Symbol | Meaning | Symbol | Meaning |
---|---|---|---|
> | Greater than | != | Not equal to |
< | Less than | %in% | Is one of a set of values |
== | Equal to | is.na | is NA |
>= | Greater than or equal to | !is.na | is not NA |
<= | Less than or equal to | \|, & | or, and |
# Load the iris data set into the workspace.
data("iris")

# Drop the versicolor rows, then take the mean of every remaining column
# for each of the species that are left.
DF <- iris %>%
  filter(Species != "versicolor") %>%
  group_by(Species) %>%
  summarise_all(mean)
Species | Sepal.Length | Sepal.Width | Petal.Length | Petal.Width |
---|---|---|---|---|
setosa | 5.006 | 3.428 | 1.462 | 0.246 |
virginica | 6.588 | 2.974 | 5.552 | 2.026 |
# Keep only rows that satisfy BOTH conditions (Petal.Length >= 4 and
# Sepal.Length >= 5), then count the surviving rows per species.
DF <- iris %>%
  filter(Petal.Length >= 4 & Sepal.Length >= 5) %>%
  group_by(Species) %>%
  summarise(N = n())
Species | N |
---|---|
versicolor | 39 |
virginica | 49 |
# Load the iris data set into the workspace.
data("iris")

# Drop the versicolor rows, then compute both the mean and the standard
# deviation of every remaining column for each species that is left.
# The list names become column suffixes (e.g. Petal.Width_Mean).
DF <- iris %>%
  filter(Species != "versicolor") %>%
  group_by(Species) %>%
  summarise_all(.funs = list(Mean = mean, SD = sd))
Species | Sepal.Length_Mean | Sepal.Width_Mean | Petal.Length_Mean | Petal.Width_Mean | Sepal.Length_SD | Sepal.Width_SD | Petal.Length_SD | Petal.Width_SD |
---|---|---|---|---|---|---|---|---|
setosa | 5.006 | 3.428 | 1.462 | 0.246 | 0.3524897 | 0.3790644 | 0.1736640 | 0.1053856 |
virginica | 6.588 | 2.974 | 5.552 | 2.026 | 0.6358796 | 0.3224966 | 0.5518947 | 0.2746501 |
# Four alternative ways of choosing the petal columns before averaging
# them per species.

# 1. Name the columns to keep.
iris %>%
  group_by(Species) %>%
  select(Petal.Length, Petal.Width) %>%
  summarize_all(mean)

# 2. Name the columns to drop.
iris %>%
  group_by(Species) %>%
  select(-Sepal.Length, -Sepal.Width) %>%
  summarize_all(mean)

# 3. Keep every column whose name contains "Petal".
iris %>%
  group_by(Species) %>%
  select(contains("Petal")) %>%
  summarize_all(mean)

# 4. Drop every column whose name contains "Sepal".
iris %>%
  group_by(Species) %>%
  select(-contains("Sepal")) %>%
  summarize_all(mean)
Species | Petal.Length | Petal.Width |
---|---|---|
setosa | 1.462 | 0.246 |
versicolor | 4.260 | 1.326 |
virginica | 5.552 | 2.026 |
# Read the CSV straight from its GitHub URL into Active_Cases.
Active_Cases <- read_csv(
  "https://raw.githubusercontent.com/MinCiencia/Datos-COVID19/master/output/producto19/CasosActivosPorComuna_std.csv"
)
Using the repository database of the Ministry of Science of Chile, generate a data frame that answers the following: