library(mosaic)

## ------------------------------------------------------------------------
# Option 1: read the data set directly from a URL
mydata <- read.csv(file = "http://statistics.uchicago.edu/~collins/data/datafile.csv")

## ------------------------------------------------------------------------
# Option 2: read a copy stored in the current working directory
# (if both lines are run, this assignment replaces the data read above)
mydata <- read.csv(file = "datafile.csv")

## ------------------------------------------------------------------------
# variable names, types, and a preview of the values
glimpse(mydata)
# summary of the values in each column (variable)
summary(mydata)
# the first 10 rows (cases)
head(mydata, n = 10)
# the last 7 rows (cases)
tail(mydata, n = 7)
# only the names of the columns (variables)
names(mydata)

###########################################################################
# Numerical Summaries
###########################################################################

## ------------------------------------------------------------------------
# mean of y within each level of g (formula interface)
mean(y ~ g, data = mydata)

## ------------------------------------------------------------------------
# the long way: subset y by hand for the rows labelled group1, group2, group3
with(mydata, mean(y[g == "group1"]))
with(mydata, mean(y[g == "group2"]))
with(mydata, mean(y[g == "group3"]))

## ------------------------------------------------------------------------
# one call: apply mean() to the values of y split by g
with(mydata, tapply(y, g, mean))

## ------------------------------------------------------------------------
# the formula version again, for comparison with the two approaches above
mean(y ~ g, data = mydata)

## ------------------------------------------------------------------------
# Every call below uses the formula interface loaded by library(mosaic):
#   ~ x            summarize the column x
#   ~ x | g        summarize x separately within each level of g
#   ~ x | g1 + g2  summarize x within each combination of g1 and g2
#   y ~ x          a statistic computed from the pair of columns y and x
# `data = mydata` names the data frame in which the columns are looked up.

# sample average, sd, and variance
mean(~ x, data = mydata)
sd(~ x, data = mydata)
var(~ x, data = mydata)
# five-number summary: min, Q1, median, Q3, max
# (first one value at a time ...)
min(~ x, data = mydata)
quantile(~ x, data = mydata, probs = 0.25)
median(~ x, data = mydata)
# the 0.50 quantile is another way to ask for the median
quantile(~ x, data = mydata, probs = 0.50)
quantile(~ x, data = mydata, probs = 0.75)
max(~ x, data = mydata)
# (... then all five values in a single call)
# NOTE: fivenum() reports Tukey's hinges, which may differ slightly from the
# quartiles returned by quantile() with its default settings
fivenum(~ x, data = mydata)
quantile(~ x, data = mydata, probs = c(0, 0.25, 0.50, 0.75, 1))
# five number summary, plus mean, sd, and more
favstats(~ x, data = mydata)
# separate summaries of a quantitative variable by groups/categories
favstats(~ x | g, data = mydata)
mean(~ x | g, data = mydata)
sd(~ x | g, data = mydata)
# summaries of x for each combination of values from two sets of groups/categories
mean(~ x | g1 + g2, data = mydata)
sd(~ x | g1 + g2, data = mydata)
favstats(~ x | g1 + g2, data = mydata)
# covariance and correlation between y and x
cov(y ~ x, data = mydata)
cor(y ~ x, data = mydata)

###########################################################################
# Graphical Summaries
###########################################################################

# In the gf_*() formulas the left-hand side is drawn on the vertical axis and
# the right-hand side on the horizontal axis; `| g` makes one panel per level of g.
# Layers and labels are chained onto a plot with %>%.

# separate boxplots for each group/category
# (the data column named x is on the vertical axis here, g on the horizontal axis)
gf_boxplot(x ~ g, data = mydata) %>%
  gf_labs(x = "x-axis label", y = "y-axis label (with units)")
# same boxplots, flipped; %>% binds more tightly than +, so coord_flip() is
# added to the plot produced by the whole %>% chain
gf_boxplot(x ~ g, data = mydata) %>%
  gf_labs(x = "x-axis label", y = "y-axis label (with units)") + 
  coord_flip() # flip the plot to get horizontal boxplots
# histogram (20 bins, bars outlined in white)
gf_histogram(~ x, data = mydata, bins = 20, color = "white")
# separate histograms for each group/category
gf_histogram(~ x | g, data = mydata, bins = 15, color = "white") 
# scatterplot of y vs. x
gf_point(y ~ x, data = mydata) %>%
  gf_labs(x = "x-axis label (with units)", y = "y-axis label (with units)")
gf_point(y ~ x | g, data = mydata) # separate scatterplots for each group/category
gf_point(y ~ x, color = ~ g, data = mydata) # one plot, color indicates group
# one plot, color and size indicate diff groups
# (a one-sided formula such as ~ g1 maps a data column to the aesthetic;
#  a plain value such as alpha = 0.5 is applied to every point)
gf_point(y ~ x, size = ~ g2, color = ~ g1, alpha = 0.5, data = mydata) 
# add regression line to a scatterplot
# (the line is taken from the coefficients of the model passed as `model`)
gf_point(y ~ x, data = mydata) %>%
  gf_coefline(model = lm(y ~ x, data = mydata))
# normal quantile plot, with a reference line added by gf_qqline()
gf_qq(~ y, data = mydata) %>% gf_qqline()
# z uses the overall mean and sd of y (all rows together, not within groups)
mydata <- mutate(mydata, z = (y - mean(y)) / sd(y)) # standardize the data
gf_qq(~ z, data = mydata) %>% gf_qqline() # normal quantile plot with data standardized
# the panels below split z by g, but z itself was standardized over the whole data set
gf_qq(~ z | g, data = mydata) %>% gf_qqline() # separate quantile plots for each group/category

###########################################################################
# Simple Linear Regression
###########################################################################

# find the least squares regression equation and save the results
myFit <- lm(y ~ x, data = mydata)
# examine the results
summary(myFit)  # coefficient table, residual standard error, R-squared, F test
coef(myFit)     # just the estimated intercept and slope
tidy(myFit)     # coefficient table as a data frame (one row per term)
glance(myFit)   # one-row data frame of model-level statistics
# find the SSE for a model: Recall sigmahat^2 = SSE / df, so SSE = sigmahat^2 * df
# (sigmahat is the residual standard error, df the residual degrees of freedom).
# The glance() columns are selected by name so the result does not depend on
# the order in which glance() happens to return its columns.
sigmahat <- glance(myFit)$sigma
df <- as.numeric( glance(myFit)$df.residual )
SSE <- sigmahat^2 * df  # for an lm fit this should agree with deviance(myFit)
c(sigmahat, df, SSE) # view the results
# make a scatterplot of y vs. x and add the regression line to the plot
# (gf_coefline() takes the line from the coefficients of the saved fit, myFit)
gf_point(y ~ x, data = mydata) %>%
  gf_coefline(model = myFit)
# you can add other lines to any scatterplot
# (linetype = 2 draws a dashed line)
gf_point(y ~ x, data = mydata) %>%
  gf_abline(intercept = 100, slope = 4, linetype = 2) # dashed line: y = 100 + 4x
gf_point(y ~ x, data = mydata) %>%
  gf_hline(yintercept = mean(~ y, data = mydata)) # horizontal line at ybar
gf_point(y ~ x, data = mydata) %>%
  gf_vline(xintercept = 20, linetype = 2) # vertical (dashed) line at x = 20
# or put all these lines on the same plot!  Crazy!
# (each %>% step adds one more line to the plot built so far)
gf_point(y ~ x, data = mydata) %>%
  gf_coefline(model = myFit) %>%
  gf_abline(intercept = 100, slope = 4, linetype = 2) %>%
  gf_hline(yintercept = mean(~ y, data = mydata)) %>%
  gf_vline(xintercept = 20, linetype = 2)
# store the fitted values, residuals, and standardized residuals from myFit
# as new columns of the data frame
mydata <- mydata %>%
  mutate(
    fitted = fitted(myFit),
    resids = residuals(myFit),
    sresids = rstandard(myFit)
  )

# residuals vs. fitted values, with a dashed reference line at y = 0
gf_point(resids ~ fitted, data = mydata) %>%
  gf_hline(yintercept = 0, linetype = 2) %>%
  gf_labs(x = "fitted values", y = "residuals")

# standardized residuals vs. fitted values, with dashed reference lines
# at y = -2, 0, and 2
gf_point(sresids ~ fitted, data = mydata) %>%
  gf_hline(yintercept = c(-2, 0, 2), linetype = 2) %>%
  gf_labs(x = "fitted values", y = "standardized residuals")