\documentclass[10pt]{article}
\usepackage{amsmath,amssymb,amsthm}
\usepackage{fancyhdr,url,hyperref}
\usepackage{graphicx,xspace}
\usepackage{subfigure}
\usepackage{tikz}
\usetikzlibrary{arrows,decorations.pathmorphing,backgrounds,positioning,fit,through}
\oddsidemargin 0in %0.5in
\topmargin 0in
\leftmargin 0in
\rightmargin 0in
\textheight 9in
\textwidth 6in %6in
%\headheight 0in
%\headsep 0in
%\footskip 0.5in
\newtheorem{thm}{Theorem}
\newtheorem{cor}[thm]{Corollary}
\newtheorem{obs}{Observation}
\newtheorem{lemma}{Lemma}
\newtheorem{claim}{Claim}
\newtheorem{definition}{Definition}
\newtheorem{question}{Question}
\newtheorem{answer}{Answer}
\newtheorem{problem}{Problem}
\newtheorem{solution}{Solution}
\newtheorem{conjecture}{Conjecture}
\pagestyle{fancy}
\lhead{\textsc{Prof. McNamara}}
\chead{\textsc{SDS/MTH 220: Lecture notes}}
\lfoot{}
\cfoot{}
%\cfoot{\thepage}
\rfoot{}
\renewcommand{\headrulewidth}{0.2pt}
\renewcommand{\footrulewidth}{0.0pt}
\newcommand{\ans}{\vspace{0.25in}}
\newcommand{\R}{{\sf R}\xspace}
\newcommand{\cmd}[1]{\texttt{#1}}
\newcommand{\Ex}{\mathbb{E}}
\rhead{\textsc{November 10, 2017}}
\begin{document}
\paragraph{Agenda}
\begin{enumerate}
\itemsep0em
\item Chi-Squared test for independence
\end{enumerate}
%
% \paragraph{In-Class Exercise, OI, 3.40}
%
% \textbf{Evolution vs. creationism} A Gallup Poll released in December 2010 asked 1019 adults living in the Continental U.S. about their belief in the origin of humans. These results, along with results from a more comprehensive poll from 2001 (that we will assume to be exactly accurate), are summarized in the table below:
% \begin{center}
% \begin{tabular}{l c c}
% & \multicolumn{2}{c}{\textit{Year}} \\
% \cline{2-3}
% \textit{Response} & 2010 & 2001 \\
% \hline
% Humans evolved, with God guiding (1) & 38\% & 37\% \\
% Humans evolved, but God had no part in process (2) & 16\% & 12\% \\
% God created humans in present form (3) & 40\% & 45\% \\
% Other / No opinion (4) & 6\% & 6\% \\
% \hline
% \end{tabular}
% \end{center}
%
% \begin{enumerate}
% \itemsep1.5in
% \item Calculate the actual number of respondents in 2010 that fall in each response category.
% \item State hypotheses for the following research question: have beliefs on the origin of human life changed since 2001?
% \item Calculate the expected number of respondents in each category under the condition that the null hypothesis is true.
% \item Conduct a chi-square test and state your conclusion. (Reminder: verify conditions.)
% \end{enumerate}
\paragraph{Warmup}
% OI, 3.37
Determine if the statements below are true or false. For each false statement, suggest an alternative wording to make it a true statement.
\begin{enumerate}
\itemsep0.2in
\item The chi-square distribution, just like the normal distribution, has two parameters, mean and standard deviation.
\item The chi-square distribution is always right skewed, regardless of the value of the degrees of freedom parameter.
\item The chi-square statistic is always positive.
\item As the degrees of freedom increases, the shape of the chi-square distribution becomes more skewed.
\item As the degrees of freedom increases, the mean of the chi-square distribution increases.
% \item If you found X 2 = 10 with df = 5 you would fail to reject H 0 at the 5% significance level.
\item When finding the p-value of a chi-square test, we always shade the tail areas in both tails.
\item As the degrees of freedom increases, the variability of the chi-square distribution decreases.
\end{enumerate}
\paragraph{Independence}
Last time, we saw how to compare a sample from a categorical variable with multiple outcomes to a known distribution. This information could be summarized in a \emph{one-way table}, and we had three methods of constructing the sampling distribution for the test statistic $X^2$:
\begin{enumerate}
\item Simulation
\item Multinomial Distribution
\item $\chi^2$-approximation
\end{enumerate}
A \emph{two-way table} allows us to examine the relationship between two categorical variables, with at least two outcomes each. Here, one variable is broken down in terms of another, and we will again have three methods:
\begin{enumerate}
\item Simulation: Randomization Test (recall the mites and wilt disease, or the yawning experiment)
\item \href{http://en.wikipedia.org/wiki/Hypergeometric_distribution}{Hypergeometric Distribution}: We won't discuss this, but it is a well-known discrete probability distribution. See the \R command \cmd{dhyper} or \href{http://en.wikipedia.org/wiki/Fisher%27s_exact_test}{Fisher's Exact Test} for more information.
\item $\chi^2$-approximation: this works basically the same as before. We compute the test statistic
\[
X^2 = \sum_{i=1}^k \sum_{j=1}^\ell Z_{ij}^2 = \sum_{i=1}^k \sum_{j=1}^\ell \left( \frac{\text{observed}_{ij} - \text{expected}_{ij}}{\sqrt{\text{expected}_{ij}}} \right)^2 = \sum_{i=1}^k \sum_{j=1}^\ell \frac{(\text{observed}_{ij} - \text{expected}_{ij})^2}{\text{expected}_{ij}} \,,
\]
where $i$ iterates over the $k$ possible values of one variable, and $j$ iterates over the $\ell$ possible values of the other variable. Under the null hypothesis of independence (and provided the expected counts are large enough), $X^2$ approximately follows a $\chi^2$ distribution with $(k-1) \cdot (\ell - 1)$ degrees of freedom.
\end{enumerate}
This type of analysis will allow us to evaluate the possibility that the two categorical variables are independent of one another.
\paragraph{In-Class Exercise, OI, 3.41}
\textbf{Offshore drilling, Part III} The table below summarizes a data set we first encountered in Exercise~3.29 that examines the responses of a random sample of college graduates and non-graduates on the topic of oil drilling. Complete a chi-square test for these data to check whether there is a statistically significant difference in responses from college graduates and non-graduates.
\begin{center}
\begin{tabular}{l c c}
& \multicolumn{2}{c}{\textit{College Grad}} \\
\cline{2-3}
& Yes & No \\
\cline{2-3}
Support & 154 & 132 \\
Oppose & 180 & 126 \\
Do not know & 104 & 131 \\
\cline{1-3}
Total & 438 & 389
\end{tabular}
\end{center}
\begin{enumerate}
\itemsep0.75in
\item State the null and alternative hypotheses.
\item Find the expected counts for all six cells under the null.
\item Compute the $X^2$ test statistic and carry out the hypothesis test.
\vspace{0.5in}
\end{enumerate}
\paragraph{In-Class Exercise, OI, 3.43}
A 2011 survey asked 806 randomly sampled adult Facebook users about their Facebook privacy settings. One of the questions on the survey was, ``Do you know how to adjust your Facebook privacy settings to control what people can and cannot see?'' The
responses are cross-tabulated based on gender.
\begin{center}
\begin{tabular}{l c c}
& \multicolumn{2}{c}{\textit{Gender}} \\
\cline{2-3}
& Male & Female \\
\cline{2-3}
Yes & 288 & 378 \\
No & 61 & 62 \\
Not sure & 10 & 7 \\
\cline{1-3}
Total & 359 & 447
\end{tabular}
\end{center}
\begin{enumerate}
\itemsep0.75in
\item State appropriate hypotheses to test for independence of gender and whether or not Facebook users know how to adjust their privacy settings.
\item Verify any necessary conditions for the test and determine whether or not a chi-square test can be completed.
\end{enumerate}
%
% \newpage
%
% \paragraph{Solution to 3.41}
%
% We will complete a chi-squared test for independence. The hypotheses are as follows:
% \begin{description}
% \item[] $H_0$: The opinion of college grads and non-grads is not different on the topic of drilling for oil and natural gas off the coast of California.
% \item[] $H_A$: Opinions regarding the drilling for oil and natural gas off the coast of California have an association with earning a college degree.
% \end{description}
%
% Under $H_0$ the expected counts can be calculated as follows: \\
% \begin{minipage}[c]{0.5\textwidth}
% \begin{align*}
% E_{row~1, col~1} &= \frac{438 \times (154+132)}{827} = 151.5 \\
% E_{row~2, col~1} &= \frac{438 \times (180+126)}{827} = 162.1 \\
% E_{row~3, col~1} &= \frac{438 \times (104+131)}{827} = 124.5 \\
% \end{align*}
% \end{minipage}
% \begin{minipage}[c]{0.5\textwidth}
% \begin{align*}
% E_{row~1, col~2} &= \frac{389 \times (154+132)}{827} = 134.5 \\
% E_{row~2, col~2} &= \frac{389 \times (180+126)}{827} = 143.9 \\
% E_{row~3, col~2} &= \frac{389 \times (104+131)}{827} = 110.5 \\
% \end{align*}
% \end{minipage}
% Now that we have the expected counts, we can check if the conditions for inference are satisfied.
% \begin{description}
% \item Independence: The samples are both random, unrelated, and from less than 10\% of the population, so independence is reasonable.
% \item Sample size: All expected counts are at least 5.
% \item Degrees of freedom: $df = (R - 1) \times (C - 1) = (3 - 1) \times (2 - 1) = 2$
% \end{description}
%
% Since all conditions are met, we can continue with the hypothesis test. The chi-squared statistic, the degrees of freedom associated with it, and the p-value can be calculated as follows:
% \begin{align*}
% X^2 = \sum \frac{(O - E)^2}{E} &= \frac{(154 - 151.5)^2} {151.5} + \frac{(132 - 134.5)^2} {134.5} + \frac{(180 - 162.1)^2} {162.1} \\
% &+ \frac{(126 - 143.9)^2} {143.9} + \frac{(104 - 124.5)^2} {124.5} + \frac{(131 - 110.5)^2} {110.5} = 11.47 \\
% df &= 2 \\
% 0.001 &< \text{p-value} < 0.005
% \end{align*}
% Since the p-value $< \alpha$, we reject $H_0$. There is strong evidence that there is some difference in the rate of support for drilling for oil and natural gas off the Coast of California based on whether or not the respondent graduated from college. Support for off-shore drilling and having graduated from college do not appear to be independent.
\end{document}