\documentclass[10pt]{article}
\usepackage{amsmath,amssymb,amsthm}
\usepackage{fancyhdr,url,hyperref}
\usepackage{graphicx,xspace}
\usepackage{subfigure}
\usepackage{tikz}
\usetikzlibrary{arrows,decorations.pathmorphing,backgrounds,positioning,fit,through}
% Page geometry: side and top margin offsets set to 0in, with a 6in-wide, 9in-tall text block.
\oddsidemargin 0in %0.5in
\topmargin 0in
% NOTE(review): \leftmargin and \rightmargin are list-indentation lengths in standard LaTeX, not page margins -- confirm these two assignments have the intended effect.
\leftmargin 0in
\rightmargin 0in
\textheight 9in
\textwidth 6in %6in
% Header/footer spacing overrides, currently disabled.
%\headheight 0in
%\headsep 0in
%\footskip 0.5in
% Theorem-like environments. Corollaries share the theorem counter (so cor must be declared after thm); every other environment here gets its own independent counter.
\newtheorem{thm}{Theorem}
\newtheorem{cor}[thm]{Corollary}
\newtheorem{obs}{Observation}
\newtheorem{lemma}{Lemma}
\newtheorem{claim}{Claim}
\newtheorem{definition}{Definition}
\newtheorem{question}{Question}
\newtheorem{answer}{Answer}
\newtheorem{problem}{Problem}
\newtheorem{solution}{Solution}
\newtheorem{conjecture}{Conjecture}
% Running header and footer (fancyhdr): instructor name on the left, course title in the center; all three footer fields are left empty.
\pagestyle{fancy}
% The tie after "Prof." prevents a line break there and avoids end-of-sentence spacing after the period.
\lhead{\textsc{Prof.~McNamara}}
\chead{\textsc{SDS/MTH 220: Lecture notes}}
\lfoot{}
\cfoot{}
%\cfoot{\thepage}
\rfoot{}
% Thin rule under the header; zero-width (invisible) rule above the footer.
\renewcommand{\headrulewidth}{0.2pt}
\renewcommand{\footrulewidth}{0.0pt}
% Fixed 0.25in vertical skip (presumably room for a written answer).
\newcommand{\ans}{\vspace{0.25in}}
% The name "R" set in sans serif; \xspace supplies the trailing space when one is needed.
% Uses \textsf rather than the obsolete two-letter font switch \sf.
\newcommand{\R}{\textsf{R}\xspace}
% Inline code set in typewriter type.
\newcommand{\cmd}[1]{\texttt{#1}}
% Date shown on the right-hand side of the running header.
\rhead{\textsc{October 4, 2017}}
\begin{document}
\paragraph{Agenda}
\begin{enumerate}
\itemsep0em
% \item Multiple regression
\item Inference through Randomization
\end{enumerate}
% \paragraph{Italian restaurant data}
% The Zagat guide contains restaurant ratings and reviews for many major world cities. We want to understand variation in the average $Price$ of a dinner in Italian restaurants in New York City. Specifically, we want to know how customer ratings (measured on a scale of 0 to 30) of the $Food$, $Decor$, and $Service$, as well as whether the restaurant is located to the $East$ or west of 5th Avenue, are associated with the average $Price$ of a meal. The data contains ratings and prices for 168 Italian restaurants in 2001.
%
% <>=
% library(mosaic)
% @
%
% <>=
% opts_chunk$set(tidy=TRUE, size='footnotesize')
% @
%
% <>=
% NYC <- read.csv("http://www.math.smith.edu/~bbaumer/mth241/nyc.csv")
% ggplot(data = NYC, aes(x = jitter(Service), y = Price)) +
% geom_point(alpha = 0.5, size = 2) + geom_smooth(method = "lm", se = 0) +
% xlab("Jittered service rating") + ylab("Average Price (US$)")
% #lm(Price ~ Service, data = NYC)
% @
%
% \begin{enumerate}
% \itemsep1.2in
% \item Use {\tt qplot()} to examine the bivariate relationships between $Price$, $Food$ and $Service$.
% \item What do you observe? Describe the form, direction, and strength of the relationships.
% \item Use {\tt lm()} to build a SLR model for $Price$ as a function of $Food$. Interpret the coefficients of this model. How is the quality of the food at these restaurants associated with its price?
% \item Build a parallel slopes model by conditioning on the $East$ variable. Interpret the coefficients of this model. What is the value of being on the East Side of Fifth Avenue?
% \vspace{1in}
% \end{enumerate}
%
% <<>>=
% m1 <- lm(Price~Food, data=NYC)
% coef(m1)
% m2 <- lm(Price~Food+East, data=NYC)
% coef(m2)
% @
%
% \begin{enumerate}
% \itemsep1.3in
% \item Interpret the coefficients from the first model. What shape is it?
% \clearpage
% \item Interpret the coefficients from the second model. What is the value of being on the East Side of Fifth Avenue? What shape is the model?
% \item Calculate the expected $Price$ of a restaurant in the East Village with a $Food$ rating of 23.
% \vspace{1in}
% \end{enumerate}
%
% \paragraph{Multiple Regression with a Second Quantitative Variable}
% If $X_2$ is a quantitative variable, then we have
%
% $$
% \hat{Y} = \hat{\beta}_0 + \hat{\beta}_1 \cdot X_1 + \hat{\beta}_2 \cdot X_2
% $$
% Notice that our model is no longer a line, rather it is a \emph{plane} that lives in three dimensions!
% \paragraph{Italian Restaurants (continued)}
% Now suppose that we want to improve our model by considering not only the quality of the $Food$, but also the quality of the $Service$. We can do this in {\tt R} by simply adding another variable to our regression model.
%
% <>=
% m3 <- lm(Price ~ Food + Service, data = NYC)
% coef(m3)
% @
%
% Now, our model is a plane!
% \begin{figure}[htbp]
% \includegraphics[height=4in]{../gfx/model_plane}
% \end{figure}
%
% <>=
% require(rgl)
% opacity <- 0.4
% with(NYC, plot3d(x = Food, y = Service, z = Price, type = "s", size = 0.3,
% col = "blue", xlab = "Food Rating", ylab = "Service Rating",
% zlab = "Price ($)"))
% coefs <- coef(m3)
% planes3d(coefs["Food"], coefs["Service"], -1, coefs["(Intercept)"], alpha = opacity,
% col = "lightgray")
% @
%
% % \begin{enumerate}
% \itemsep0.7in
% \item Interpret the coefficients of this model. What does the coefficient of $Food$ mean in the real-world context of the problem? $Service$?
% \item How important is $Service$ relative to $Food$? Is it fair to compare the two coefficients?
% \item Use {\tt makeFun()} to find the expected $Price$ of a restaurant with a $Food$ rating of 21 but a $Service$ rating of 28.
% \item Calculate the residual for \href{http://www.zagat.com/r/san-pietro-new-york}{San Pietro}. Is it overpriced?
%
% <>=
% filter(NYC, Restaurant == "San Pietro")
% @
%
% If we add all three explanatory variables to the model, it becomes two parallel planes.
%
% <<>>=
% m4 <- lm(Price ~ Food + Service + East, data=NYC)
% coef(m4)
% @
%
%
% \begin{figure}[htbp]
% \includegraphics[height=4in]{../gfx/model_parallelplanes}
% \end{figure}
%
% What if we added an additional numeric variable? What shape would the model have then?
% \vspace{1in}
%
% <>=
% with(NYC, plot3d(x = Food, y = Service, z = Price, type = "s", size = 0.8,
% col = "blue", xlab = "Food Rating", ylab = "Service Rating",
% zlab = "Price ($)"))
% coefs <- coef(lm(Price ~ Food + Service + East, data=NYC))
% planes3d(coefs["Food"], coefs["Service"], -1, coefs["(Intercept)"], alpha = opacity, col = "lightgray")
% planes3d(coefs["Food"], coefs["Service"], -1, coefs["(Intercept)"] + coefs["East"], alpha = opacity, col = "lightgray")
% @
%
\paragraph{Inference through Randomization}
Researchers suspect that the attack on a plant by one organism induces resistance to subsequent attacks by a different organism. Individually potted cotton plants were randomly allocated to two groups: one that was infested by spider mites, and another that was not. After two weeks, the mites were dutifully removed by a conscientious research assistant, and both groups were inoculated with \emph{Verticillium}, a fungus that causes wilt disease.
The following table shows the number of plants that developed symptoms of wilt disease.
% R chunk: cross-tabulates outcome by treatment for the Mites data (presumably supplied by the mosaic packages).
% NOTE(review): the chunk header was malformed; it is restored to the empty-options form used by other chunks in this file. Confirm whether chunk options (e.g. suppressing package start-up messages) are wanted.
<<>>=
require(mosaic)
tally(~ outcome + treatment, data = Mites)
@
Big question: Is there a relationship between infestation and wilt disease?
\paragraph{Activity: Simulation Design}
Your objective is to answer the big question using a simulation of your own design. You may want to refer back to the yawning activity that we did on the first day of class. You're welcome to use the cards in whatever manner is most useful. \emph{Outline your procedure in detail} below, and be sure to touch on the following questions.
\begin{enumerate}
\itemsep1in
\item What does each card represent?
\item Upon what assumptions does your simulation rely?
\item What role does randomization play in the simulation?
\item What statistic will you be recording from each simulation?
\end{enumerate}
\vspace{0.5in}
\paragraph{Results}
Record your simulated statistics below, first as a table, then as a dot plot.
\vspace{3.5in}
\begin{enumerate}
\itemsep1in
\item Describe the center, shape, and spread of this randomization distribution.
\item In roughly what range would you expect to generate statistics under this model? What range would be very unusual?
\item Where does the observed statistic fall in this distribution? What conclusions do you draw concerning the association of mites and wilt disease?
\vspace{1in}
\end{enumerate}
\clearpage
\paragraph{Randomization test summary}
Our goal here is to assess whether exposure to mites was associated, \emph{to a statistically significant degree}, with a decrease in wilt disease after exposure to \emph{Verticillium}, a fungus that causes wilt disease.
\begin{enumerate}
\itemsep0.75in
\item What was the \emph{null hypothesis} for your simulation?
\item What was the \emph{test statistic}?
\item Where did the test statistic lie in the \emph{null distribution}?
\item Did this evidence cause you to \emph{reject} or \emph{fail to reject} the null hypothesis?
\item Write \emph{one} sentence summarizing what you've learned about mites and wilt disease.
\vspace{0.5in}
\end{enumerate}
% R chunk: repeats the outcome-by-treatment cross-tabulation of the Mites data for the summary page.
% NOTE(review): the chunk header was malformed; it is restored to the empty-options form used by other chunks in this file. Confirm whether chunk options were intended here.
<<>>=
require(mosaic)
tally(~ outcome + treatment, data = Mites)
@
% R chunk: builds a randomization distribution by re-tabulating outcome against a shuffled treatment 5000 times, reshapes the result to long form (one row per cell type and count), plots a histogram per cell type, and reports doubled tail proportions at the cutoffs 11, 17, 15, and 4 (presumably the observed cell counts -- confirm against the table).
% NOTE(review): the chunk header was malformed; it is restored to the empty-options form used by other chunks in this file. Confirm whether chunk options were intended here.
<<>>=
null_dist <- do(5000) * tally(~ outcome + shuffle(treatment), data = Mites)
ds <- tidyr::gather(null_dist, key = type, value = N)
qplot(data = ds, x = N, facets = ~type, binwidth = 1)
2 * pdata(~N, q = 11, data = filter(ds, type == "wilt.mites"))
2 * pdata(~N, q = 17, data = filter(ds, type == "wilt.no.mites"), lower.tail = FALSE)
2 * pdata(~N, q = 15, data = filter(ds, type == "no.wilt.mites"), lower.tail = FALSE)
2 * pdata(~N, q = 4, data = filter(ds, type == "no.wilt.no.mites"))
@
\end{document}