# Template script: numerical summaries, graphical summaries, and simple linear
# regression for a data frame `mydata`, using the mosaic formula interface.
# The column names used below (x, y, g, g1, g2) and the group labels
# ("group1", ...) are placeholders -- presumably to be replaced with the
# names found in your own data.

library(mosaic)

## ------------------------------------------------------------------------
# Read the data from the web ...
mydata <- read.csv("http://statistics.uchicago.edu/~collins/data/datafile.csv")

## ------------------------------------------------------------------------
# ... or from a file in the working directory. Running this line replaces
# the `mydata` object created by the previous read.csv() call.
mydata <- read.csv("datafile.csv")

## ------------------------------------------------------------------------
glimpse(mydata)   # info on variable names, types, and values
summary(mydata)   # summary of the values in each column (labels, variables)
head(mydata, 10)  # see the first 10 rows (cases)
tail(mydata, 7)   # see the last 7 rows (cases)
# just look at the names of the dataset columns (labels, variables)
names(mydata)

###########################################################################
# Numerical Summaries
###########################################################################

## ------------------------------------------------------------------------
# mean of y for each group in g (formula interface)
mean(y ~ g, data = mydata)

## ------------------------------------------------------------------------
# the same group means, one group at a time, by subsetting
mean(mydata$y[mydata$g == "group1"])
mean(mydata$y[mydata$g == "group2"])
mean(mydata$y[mydata$g == "group3"])

## ------------------------------------------------------------------------
# the same group means with tapply()
tapply(mydata$y, mydata$g, mean)

## ------------------------------------------------------------------------
# the formula version again, for comparison
mean(y ~ g, data = mydata)

## ------------------------------------------------------------------------
# sample average, sd, and variance
mean(~ x, data = mydata)
sd(~ x, data = mydata)
var(~ x, data = mydata)

# five-number summary: min, Q1, median, Q3, max
min(~ x, data = mydata)
quantile(~ x, data = mydata, probs = 0.25)
median(~ x, data = mydata)
quantile(~ x, data = mydata, probs = 0.50)
quantile(~ x, data = mydata, probs = 0.75)
max(~ x, data = mydata)

fivenum(~ x, data = mydata)
quantile(~ x, data = mydata, probs = c(0, 0.25, 0.50, 0.75, 1))

# five number summary, plus mean, sd, and more
favstats(~ x, data = mydata)

# separate summaries of a quantitative variable by groups/categories
favstats(~ x | g, data = mydata)
mean(~ x | g, data = mydata)
sd(~ x | g, data = mydata)

# summaries of x for each combination of values from two sets of
# groups/categories
mean(~ x | g1 + g2, data = mydata)
sd(~ x | g1 + g2, data = mydata)
favstats(~ x | g1 + g2, data = mydata)

# covariance and correlation
cov(y ~ x, data = mydata)
cor(y ~ x, data = mydata)

###########################################################################
# Graphical Summaries
###########################################################################

# separate boxplots for each group/category
gf_boxplot(x ~ g, data = mydata) %>%
  gf_labs(x = "x-axis label", y = "y-axis label (with units)")

# flip the plot to get horizontal boxplots
gf_boxplot(x ~ g, data = mydata) %>%
  gf_labs(x = "x-axis label", y = "y-axis label (with units)") +
  coord_flip()

# histogram
gf_histogram(~ x, data = mydata, bins = 20, color = "white")

# separate histograms for each group/category
gf_histogram(~ x | g, data = mydata, bins = 15, color = "white")

# scatterplot of y vs. x
gf_point(y ~ x, data = mydata) %>%
  gf_labs(x = "x-axis label (with units)", y = "y-axis label (with units)")

# separate scatterplots for each group/category
gf_point(y ~ x | g, data = mydata)

# one plot, color indicates group
gf_point(y ~ x, color = ~ g, data = mydata)

# one plot, color and size indicate diff groups
gf_point(y ~ x, size = ~ g2, color = ~ g1, alpha = 0.5, data = mydata)

# add regression line to a scatterplot
gf_point(y ~ x, data = mydata) %>%
  gf_coefline(model = lm(y ~ x, data = mydata))

# normal quantile plot
gf_qq(~ y, data = mydata) %>%
  gf_qqline()

# standardize the data
mydata <- mutate(mydata, z = (y - mean(y)) / sd(y))

# normal quantile plot with data standardized
gf_qq(~ z, data = mydata) %>%
  gf_qqline()

# separate quantile plots for each group/category
gf_qq(~ z | g, data = mydata) %>%
  gf_qqline()

###########################################################################
# Simple Linear Regression
###########################################################################

# find the least squares regression equation and save the results
myFit <- lm(y ~ x, data = mydata)

# examine the results
summary(myFit)
coef(myFit)
tidy(myFit)
glance(myFit)

# find the SSE for a model.
# Recall sigmahat = sqrt(SSE / df), so SSE = sigmahat^2 * df.
# The glance() columns are picked by name rather than by position so the
# code does not depend on the column order of the glance() output.
sigmahat <- as.numeric(glance(myFit)$sigma)
df <- as.numeric(glance(myFit)$df.residual)
SSE <- sigmahat^2 * df  # cross-check: deviance(myFit) is the same quantity
c(sigmahat, df, SSE)  # view the results

# make a scatterplot of y vs. x and add the regression line to the plot
gf_point(y ~ x, data = mydata) %>%
  gf_coefline(model = myFit)

# you can add other lines to any scatterplot
# dashed line: y = 100 + 4x
gf_point(y ~ x, data = mydata) %>%
  gf_abline(intercept = 100, slope = 4, linetype = 2)

# horizontal line at ybar
gf_point(y ~ x, data = mydata) %>%
  gf_hline(yintercept = mean(~ y, data = mydata))

# vertical (dashed) line at x = 20
gf_point(y ~ x, data = mydata) %>%
  gf_vline(xintercept = 20, linetype = 2)

# or put all these lines on the same plot! Crazy!
gf_point(y ~ x, data = mydata) %>%
  gf_coefline(model = myFit) %>%
  gf_abline(intercept = 100, slope = 4, linetype = 2) %>%
  gf_hline(yintercept = mean(~ y, data = mydata)) %>%
  gf_vline(xintercept = 20, linetype = 2)

# find the fitted values, residuals, standardized residuals,
# and add them to the data frame
mydata <- mutate(
  mydata,
  fitted = fitted(myFit),
  resids = residuals(myFit),
  sresids = rstandard(myFit)
)

# plot the residuals vs. fitted values (with dashed line at y = 0)
gf_point(resids ~ fitted, data = mydata) %>%
  gf_hline(yintercept = 0, linetype = 2) %>%
  gf_labs(x = "fitted values", y = "residuals")

# plot the standardized residuals vs. fitted values
# (with dashed lines at y = -2, 0, 2)
gf_point(sresids ~ fitted, data = mydata) %>%
  gf_hline(yintercept = c(-2, 0, 2), linetype = 2) %>%
  gf_labs(x = "fitted values", y = "standardized residuals")