tidyverse is a set of packages:
- dplyr, for data manipulation
- ggplot2, for data visualisation
- tidyr, for data tidying
- readr, for data import
- stringr, for working with strings
- forcats, for working with factors
- purrr, for functional programming
- tibble, for tibbles, a modern re-imagining of data frames

Install the tidyverse package using install.packages("tidyverse").
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.2.1 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Make the black-and-white theme the default for every plot drawn below.
theme_set(theme_bw())

# First rows of the built-in iris data frame.
head(iris)

# The same data converted to a tibble.
as_tibble(iris)

# A tibble built from scratch: a running id and the twelve month names.
tibble(
  id = 1:12,
  letters = month.name
)
readr
library(readr) # already attached by library(tidyverse)

# Comma-separated file.
df <- read_csv(file = "https://goo.gl/v7nvho")
head(df)

# Tab-separated file.
df <- read_tsv(file = "https://goo.gl/33r2Ut")
head(df)

# The same tab-separated file, read with an explicitly stated delimiter.
df <- read_delim(file = "https://goo.gl/33r2Ut", delim = "\t")
head(df)
There is also the readxl package
for reading data from .xls and .xlsx files (including from individual sheets).
dplyr
# Load the speaker data set used by the dplyr and ggplot2 examples below.
homo <- read_csv(
  file = "https://raw.githubusercontent.com/LingData2019/LingData/master/data/orientation.csv"
)
homo
The majority of examples in this presentation are based on Hau 2007. The experiment consisted of a perception and judgment test aimed at measuring the correlation between acoustic cues and perceived sexual orientation. Naïve Cantonese speakers were asked to listen to the Cantonese speech samples collected in the experiment and judge whether the speakers were gay or heterosexual. There are 14 speakers and the following parameters:
dplyr::filter()
How many speakers are older than 28 and have an [s] duration shorter than 60 ms?
# Keep only the rows that satisfy both conditions at once.
homo %>%
  filter(age > 28 & s.duration.ms < 60)
The %>%
operators pipe their left-hand side values forward into expressions that appear on the right-hand side, i.e. one can replace f(x) with x %>% f().
# Nested form: read inside-out (sin, then abs, then sqrt, then sort).
sort(sqrt(abs(sin(seq_len(22)))), decreasing = TRUE)
## [1] 0.9999951 0.9952926 0.9946649 0.9805088 0.9792468 0.9554817 0.9535709
## [8] 0.9173173 0.9146888 0.8699440 0.8665952 0.8105471 0.8064043 0.7375779
## [15] 0.7325114 0.6482029 0.6419646 0.5365662 0.5285977 0.3871398 0.3756594
## [22] 0.0940814
# The same computation as a pipeline: one step per line, read top to bottom.
1:22 %>%
  sin() %>%
  abs() %>%
  sqrt() %>%
  sort(x = ., decreasing = TRUE) # the dot marks the argument that receives the piped value
## [1] 0.9999951 0.9952926 0.9946649 0.9805088 0.9792468 0.9554817 0.9535709
## [8] 0.9173173 0.9146888 0.8699440 0.8665952 0.8105471 0.8064043 0.7375779
## [15] 0.7325114 0.6482029 0.6419646 0.5365662 0.5285977 0.3871398 0.3756594
## [22] 0.0940814
All following tasks will be performed with the dataset diamonds
from ggplot2
. This dataset contains the prices and other attributes of almost 54 000 diamonds.
How many diamonds in the diamonds
dataset have depth
greater than 65 and price greater or equal than 15 000?
dplyr::slice()
# Pick rows 3 through 7 by position.
homo %>%
  slice(3:7)
How many diamonds have the cheapest price in the observation range from 2000 to 8000?
dplyr::select()
# Columns chosen by position.
homo %>%
  select(8:10)

# A contiguous range of columns chosen by name.
homo %>%
  select(speaker:average.f0.Hz)

# Every column except one.
homo %>%
  select(-speaker)

# Every column except several.
homo %>%
  select(
    -speaker,
    -perceived.as.hetero,
    -perceived.as.homo,
    -perceived.as.homo.percent
  )

# An explicit list of columns.
homo %>%
  select(speaker, age, s.duration.ms)
dplyr::arrange()
# Sort by orientation; within each orientation, by age in descending order.
homo %>%
  arrange(orientation, desc(age))
dplyr::distinct()
# Unique values of a single column.
homo %>%
  distinct(orientation)

# Unique combinations of a column and a computed condition.
homo %>%
  distinct(orientation, age > 20)

# count() gives the same combinations plus the number of rows in each.
homo %>%
  count(orientation)

homo %>%
  count(orientation, age > 20)
How many diamonds with Fair
and Good
cut
have color
G
?
dplyr::mutate()
# Add two derived columns: the average f0 minus / plus half of the f0 range.
homo %>%
  mutate(
    f0.mn = average.f0.Hz - f0.range.Hz / 2,
    f0.mx = average.f0.Hz + f0.range.Hz / 2
  )
dplyr::group_by(...) %>% summarise(...)
# Summary of the whole table: a single output row.
homo %>%
  summarise(min(age), mean(s.duration.ms))

# One output row per orientation, with a named result column.
homo %>%
  group_by(orientation) %>%
  summarise(my_mean = mean(s.duration.ms))

# Without a name, the result column is labelled with the expression itself.
homo %>%
  group_by(orientation) %>%
  summarise(mean(s.duration.ms))

# Same statistic again under a more descriptive column name.
homo %>%
  group_by(orientation) %>%
  summarise(mean_by_orientation = mean(s.duration.ms))
If you need to count the number of group members, it is possible to use the function n()
in summarise()
or the count()
function if you don’t need any other statistics.
# Group mean together with the group size, via n() inside summarise().
homo %>%
  group_by(orientation, age > 20) %>%
  summarise(
    my_mean = mean(s.duration.ms),
    n_observations = n()
  )

# Group sizes only.
homo %>%
  count(orientation, age > 20)
If you want to add an additional column instead of getting a summary for each group, it is possible to use mutate()
with group_by()
:
# Keep every row and add the mean of its own orientation group as a new column.
homo %>%
  group_by(orientation) %>%
  mutate(mean_by_orientation = mean(s.duration.ms))
Calculate the mean value and standard deviation of price
variable for each color. In the form fill the values for color with lowest mean value
dplyr::.._join()
# Two small tables that share the column `countries`.
languages <- tibble(
  languages = c("Selkup", "French", "Chukchi", "Kashubian"),
  countries = c("Russia", "France", "Russia", "Poland"),
  iso = c("sel", "fra", "ckt", "pol")
)
languages

country_population <- tibble(
  countries = c("Russia", "Poland", "Finland"),
  population_mln = c(143, 38, 5)
)
country_population

# No `by` is given, so each join matches on the shared column `countries`.
languages %>% inner_join(country_population) # rows with a match in both tables
languages %>% left_join(country_population) # every row of `languages`
languages %>% right_join(country_population) # every row of `country_population`
languages %>% anti_join(country_population) # languages whose country is not listed
country_population %>% anti_join(languages) # countries with no language listed
country_population %>% full_join(languages) # every row of both tables
There is a nice trick that groups together calculated statistics with source data.frame. Just use .._join()
:
# Per-group statistics on their own.
homo %>%
  group_by(orientation, age > 20) %>%
  summarise(my_mean = mean(s.duration.ms), n_observations = n())

# The same statistics attached to the source rows.
# The summary is keyed by two columns (`orientation` and `age > 20`), but the
# raw table has no `age > 20` column. Joining against it directly would match
# on `orientation` alone and pair every speaker with the statistics of every
# age group in that orientation. Adding the condition to the right-hand table
# first makes the join use both keys.
homo %>%
  group_by(orientation, age > 20) %>%
  summarise(my_mean = mean(s.duration.ms), n_observations = n()) %>%
  left_join(mutate(homo, `age > 20` = age > 20))
# Wide ("short") table: one row per consonant class, one column per position.
df.short <- data.frame(
  consonant = c("stops", "fricatives", "affricates", "nasals"),
  initial = c(123, 87, 73, 7),
  intervocal = c(57, 77, 82, 78),
  final = c(30, 69, 12, 104)
)
df.short
tidyr::gather()
# Wide table to start from.
df.short <- data.frame(
  consonant = c("stops", "fricatives", "affricates", "nasals"),
  initial = c(123, 87, 73, 7),
  intervocal = c(57, 77, 82, 78),
  final = c(30, 69, 12, 104)
)
df.short

# Wide -> long: the columns initial..final collapse into a key column
# (`position`) and a value column (`number`).
df.long <- df.short %>%
  gather(key = position, value = number, initial:final)
df.long
tidyr::spread()
# Long -> wide: each distinct `position` becomes a column filled from `number`.
df.short <- df.long %>%
  spread(key = position, value = number)
df.short
Calculate mean price
value for all combinations of color
and cut
variables and create a table in a short format. What is the value of the first column and the first row?
The following sets of data were presented in Anscombe, F. J. (1973), “Graphs in Statistical Analysis”:
quartet <- read.csv(
  file = "https://raw.githubusercontent.com/LingData2019/LingData/master/data/anscombe.s.quartet.csv"
)
quartet

# Summary statistics per dataset, rounded to two decimals. The grouping column
# is dropped before rounding so that only numeric columns reach round().
quartet %>%
  group_by(dataset) %>%
  summarise(
    mean_X = mean(x),
    mean_Y = mean(y),
    sd_X = sd(x),
    sd_Y = sd(y),
    cor = cor(x, y),
    n_obs = n()
  ) %>%
  select(-dataset) %>%
  round(x = ., digits = 2)
The following sets of data were presented in Matejka and Fitzmaurice (2017), “Same Stats, Different Graphs”:
datasaurus <- read_tsv(
  file = "https://raw.githubusercontent.com/LingData2019/LingData/master/data/datasaurus.tsv"
)
head(datasaurus)

# Summary statistics per dataset, rounded to one decimal. The grouping column
# is dropped before rounding so that only numeric columns reach round().
datasaurus %>%
  group_by(dataset) %>%
  summarise(
    mean_X = mean(x),
    mean_Y = mean(y),
    sd_X = sd(x),
    sd_Y = sd(y),
    cor = cor(x, y),
    n_obs = n()
  ) %>%
  select(-dataset) %>%
  round(x = ., digits = 1)
# Basic scatter plot, data passed directly to ggplot().
ggplot(data = homo, mapping = aes(x = s.duration.ms, y = vowel.duration.ms)) +
  geom_point()

# Data piped in; a linear-model smoother under points coloured by orientation.
homo %>%
  ggplot(aes(x = average.f0.Hz, y = age)) +
  geom_smooth(method = "lm") +
  geom_point(aes(color = orientation))

# Colour mapped to a variable.
homo %>%
  ggplot(aes(x = s.duration.ms, y = vowel.duration.ms, color = orientation)) +
  geom_point()

# Shape mapped to a variable; colour set to a constant outside aes().
homo %>%
  ggplot(aes(x = s.duration.ms, y = vowel.duration.ms, shape = orientation)) +
  geom_point(color = "green")

# Point size mapped to a variable.
homo %>%
  ggplot(aes(x = s.duration.ms, y = vowel.duration.ms, size = age)) +
  geom_point()

# Symbols drawn as filled labels.
homo %>%
  mutate(label = ifelse(orientation == "homo", "⚣", "⚤")) %>%
  ggplot(aes(
    x = s.duration.ms, y = vowel.duration.ms,
    label = label, fill = orientation
  )) +
  geom_label()

# The same symbols drawn as plain coloured text.
homo %>%
  mutate(label = ifelse(orientation == "homo", "⚣", "⚤")) %>%
  ggplot(aes(
    x = s.duration.ms, y = vowel.duration.ms,
    label = label, color = orientation
  )) +
  geom_text()

# Title, subtitle, caption and axis labels.
homo %>%
  ggplot(aes(x = s.duration.ms, y = vowel.duration.ms)) +
  geom_point() +
  labs(
    title = "length of [s] vs. length of vowels",
    subtitle = "based on 14 speakers of Cantonese",
    caption = "data from [Hau 2007]",
    x = "duration of [s] in ms",
    y = "vowel duration in ms"
  )
Let’s use the frequency dictionary for Russian
freq <- read_csv(
  file = "https://raw.githubusercontent.com/LingData2019/LingData/master/data/freqrnc2011_1000.csv"
)

# Frequency (ipm) against rank, with the x-axis title left blank.
freq %>%
  ggplot(aes(x = rank, y = freq_ipm)) +
  geom_point() +
  labs(x = "", y = "ipm")
# The same plot on a logarithmic y axis.
# Every component must be chained with `+`. Without it after labs(),
# scale_y_log10() runs as a separate statement: it only prints a scale object
# and the plot keeps its linear axis.
freq %>%
  ggplot(aes(1:1000, freq_ipm)) +
  geom_point() +
  labs(x = "", y = "ipm") +
  scale_y_log10()
# Marginal rug marks alongside the scatter plot.
homo %>%
  ggplot(aes(x = s.duration.ms, y = vowel.duration.ms, color = orientation)) +
  geom_point() +
  geom_rug()

# Reference lines: horizontal at the mean vowel duration, vertical at 60.
homo %>%
  ggplot(aes(x = s.duration.ms, y = vowel.duration.ms)) +
  geom_point() +
  geom_hline(yintercept = mean(homo$vowel.duration.ms)) +
  geom_vline(xintercept = 60)

# Line type and line thickness.
homo %>%
  ggplot(aes(x = s.duration.ms, y = vowel.duration.ms)) +
  geom_point() +
  geom_hline(yintercept = 120, linetype = 2) +
  geom_vline(xintercept = 60, size = 5)

# Another line type, and a line colour.
homo %>%
  ggplot(aes(x = s.duration.ms, y = vowel.duration.ms)) +
  geom_point() +
  geom_hline(yintercept = 120, linetype = 4) +
  geom_vline(xintercept = 60, color = "blue")
The function annotate
adds a geom
to the plot.
# Highlight a region with a translucent rectangle and put a text note above it.
homo %>%
  ggplot(aes(x = s.duration.ms, y = vowel.duration.ms)) +
  geom_point() +
  annotate(
    geom = "rect",
    xmin = 77, xmax = 79, ymin = 117, ymax = 122,
    fill = "red", alpha = 0.2
  ) +
  annotate(
    geom = "text",
    x = 78, y = 125,
    label = "Who is that?\n Outlier?"
  )
In dataset diamonds
calculate mean value of the variable price
for each cut
and visualise it using argument shape = 5
.
There are two possible situations:
# The two situations: column 1 next to column 9, and column 1 next to column 10.
head(homo[, c(1, 9)])
head(homo[, c(1, 10)])

# geom_bar() takes only x and counts the rows in each category.
homo %>%
  ggplot(aes(x = orientation)) +
  geom_bar()

# geom_col() takes the bar heights from the y variable.
homo %>%
  ggplot(aes(x = speaker, y = age)) +
  geom_col()

homo %>%
  ggplot(aes(x = speaker, y = age, fill = orientation)) +
  geom_col()

# Box plot per category.
homo %>%
  ggplot(aes(x = orientation, y = s.duration.ms)) +
  geom_boxplot()

# Box plot with the raw observations on top.
homo %>%
  ggplot(aes(x = orientation, y = s.duration.ms)) +
  geom_boxplot() +
  geom_point()

# Jittered observations, so that points do not hide each other.
homo %>%
  ggplot(aes(x = orientation, y = s.duration.ms)) +
  geom_boxplot() +
  geom_jitter(width = 0.5)

# Violin plot with jittered observations.
homo %>%
  ggplot(aes(x = orientation, y = s.duration.ms)) +
  geom_violin() +
  geom_jitter()
In dataset diamonds
calculate mean, maximum and minimum value of the variable price
for each color
and visualise it using arguments color = "blue"
for maximum value and color = "red"
for minimum value and automatic coloring for mean value.
# Two logical variables plotted against each other; jitter separates the
# points that would otherwise sit on the same four positions.
mtcars %>%
  mutate(
    newvar = mpg > 22,
    newvr = mpg < 17
  ) %>%
  ggplot(aes(newvr, newvar, color = newvar)) +
  geom_jitter(width = 0.2)

# The same data aggregated first: one labelled bubble per combination, sized
# by the number of rows in it.
mtcars %>%
  mutate(
    newvar = mpg > 22,
    newvr = mpg < 17
  ) %>%
  group_by(newvar, newvr) %>%
  summarise(number = n()) %>%
  ggplot(aes(newvr, newvar, label = number)) +
  geom_point(aes(size = number, color = newvar)) +
  geom_text() +
  scale_size(range = c(10, 30)) +
  # Hide the size legend. "none" is used instead of `F`, which is an ordinary
  # reassignable variable and a deprecated value for guides() in current ggplot2.
  guides(size = "none")
# Histogram with the default number of bins.
homo %>%
  ggplot(aes(x = s.duration.ms)) +
  geom_histogram()
How many histogram bins do we need?
# Three rules for the number of bins: Sturges, Scott and Freedman-Diaconis.
nclass.Sturges(homo$s.duration.ms)
nclass.scott(homo$s.duration.ms)
nclass.FD(homo$s.duration.ms)

# Histogram using the Freedman-Diaconis bin count.
homo %>%
  ggplot(aes(x = s.duration.ms)) +
  geom_histogram(bins = nclass.FD(homo$s.duration.ms))

# Constant fill colour.
homo %>%
  ggplot(aes(x = s.duration.ms)) +
  geom_histogram(fill = "lightblue")
Create a stacked histogram for the variable price
and color by variable cut
Create a side-to-side histogram for the variable price
and color by variable cut
using argument position = "dodge"
.
# Plain density curve.
homo %>%
  ggplot(aes(x = s.duration.ms)) +
  geom_density()

# Outline colour.
homo %>%
  ggplot(aes(x = s.duration.ms)) +
  geom_density(color = "blue")

# Fill colour.
homo %>%
  ggplot(aes(x = s.duration.ms)) +
  geom_density(fill = "lightblue")

# One density per orientation, told apart by fill.
homo %>%
  ggplot(aes(x = s.duration.ms, fill = orientation)) +
  geom_density()

# Transparent fills keep overlapping densities visible.
homo %>%
  ggplot(aes(x = s.duration.ms, fill = orientation)) +
  geom_density(alpha = 0.2)

# Ridgeline version: one density per orientation on its own baseline.
library(ggridges)
homo %>%
  ggplot(aes(x = s.duration.ms, y = orientation, fill = orientation)) +
  geom_density_ridges()
Create a density plot for the variable price
and color by variable cut
. Use argument alpha = 0.5
.
ggplot2::facet_wrap()
# One panel per orientation, with shared axes.
homo %>%
  ggplot(aes(x = speaker, y = s.duration.ms)) +
  geom_point() +
  facet_wrap(~orientation)

# Both axes free to differ between panels.
homo %>%
  ggplot(aes(x = speaker, y = s.duration.ms)) +
  geom_point() +
  facet_wrap(~orientation, scales = "free")

# Only the x axis free; the y axis stays shared.
homo %>%
  ggplot(aes(x = speaker, y = s.duration.ms)) +
  geom_point() +
  facet_wrap(~orientation, scales = "free_x")
ggplot2::facet_grid()
# Two faceting variables with facet_wrap().
homo %>%
  mutate(older_than_28 = ifelse(age > 28, "older", "younger")) %>%
  ggplot(aes(x = speaker, y = s.duration.ms)) +
  geom_point() +
  facet_wrap(older_than_28 ~ orientation, scales = "free_x")

# The same two variables with facet_grid(): rows by age group, columns by
# orientation.
homo %>%
  mutate(older_than_28 = ifelse(age > 28, "older", "younger")) %>%
  ggplot(aes(x = speaker, y = s.duration.ms)) +
  geom_point() +
  facet_grid(older_than_28 ~ orientation, scales = "free_x")
There is also a useful argument, margins
:
# The same grid with margins switched on.
homo %>%
  mutate(older_than_28 = ifelse(age > 28, "older", "younger")) %>%
  ggplot(aes(x = speaker, y = s.duration.ms)) +
  geom_point() +
  facet_grid(
    older_than_28 ~ orientation,
    scales = "free_x",
    margins = TRUE
  )
Sometimes it is nice to show all data on each facet:
homo %>%
  ggplot(aes(speaker, s.duration.ms)) +
  # Background layer: the same data without the faceting variable, so it cannot
  # be split by facet and is drawn in grey on every panel. The column is
  # dropped by name rather than by position, so the plot keeps working if the
  # column order of the data changes.
  geom_point(
    data = select(homo, -orientation),
    aes(speaker, s.duration.ms),
    color = "grey"
  ) +
  # Foreground layer: only the points that belong to the panel.
  geom_point() +
  facet_wrap(~orientation) +
  theme_bw()
Reproduce the graph: