# © 2018 - Guillaume Desagulier
# Companion file to "Why corpus linguists should be wary of kidney stones and Simpson’s paradox," in Around the word, 10/01/2018, https://corpling.hypotheses.org/?p=326.

# install the car package only if it is not already available, then load it
# (an unconditional install.packages() call re-downloads the package on every
# run and fails when there is no network access)
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}
library(car)

# number of data points drawn for each group
n <- 100

# fix the seed of the random number generator so that the draws below
# are the same on every run
set.seed(43)

# group 1: x is normal around 50 (sd 15);
# y is -0.6 * x plus normal noise around 50 (sd 30)
x1 <- rnorm(n, mean = 50, sd = 15)
y1 <- rnorm(n, mean = 50, sd = 30) - 0.6 * x1

# group 2: same recipe, with x and the noise centred on 100
x2 <- rnorm(n, mean = 100, sd = 15)
y2 <- rnorm(n, mean = 100, sd = 30) - 0.6 * x2

# group 3: same recipe, with x and the noise centred on 150
x3 <- rnorm(n, mean = 150, sd = 15)
y3 <- rnorm(n, mean = 150, sd = 30) - 0.6 * x3

# pool the three groups, keeping them in order (first, second, third)
X <- c(x1, x2, x3)
Y <- c(y1, y2, y3)

# we plot the pooled data, ignoring the confounding variable
# (no marginal boxplots, no smoother)
scatterplot(X, Y, boxplots = "", smooth = FALSE)

# we assign three groups: one label per data point, in the same order in
# which the groups were concatenated into X and Y
group <- rep(c("first", "second", "third"), each = n)

# and plot the data again, this time with one colour per group.
# NOTE: car >= 3.0 replaced the old `legend.plot` argument with `legend`;
# with the old name the legend is not suppressed.
scatterplot(
  X, Y,
  groups = group,
  col = c("magenta", "cyan4", "dodgerblue"),
  legend = FALSE,
  smooth = FALSE
)