% NOTE -- ONLY EDIT THE .Rnw FILE!!! The .tex file is % likely to be overwritten. \documentclass[compress]{beamer} \usepackage{SweaveBeamer} \input{commondefs} \SweaveOpts{prefix.string=figs/introduction,eps=FALSE,pdf=TRUE,keep.source=TRUE} \title{A Quick Introduction to \R{}} \subtitle{Introduction and examples} \begin{document} \begin{frame} \titlepage \end{frame} <>= options(width = 60) @ \begin{frame} \frametitle{What is \R{}} {\it \R\ provides an environment in which you can perform statistical analysis and produce graphics. It is actually a complete programming language, although that is only marginally described in this book.} \\ ~~~---Peter Dalgaard, \textbf{``Introductory Statistics with R''}, 2002 \begin{itemize} \item \R\ can be used as a toolbox for standard statistical techniques. \item Some knowledge of \R\ programming essential to use it well. \item For advanced users, the main appeal of \R\ is as a programming environment suited to data analysis. \end{itemize} More information available at the R Project homepage:\\ \curl{http://www.r-project.org} \end{frame} \begin{frame} \frametitle{Our goal for the first set of lectures} \ldots is basically to get comfortable using \R. We will learn \begin{itemize} \item to do some elementary statistics \item to use the documentation / help system \item about the language basics \item about data manipulation \end{itemize} We will learn about other specialized tools later when they are required. 
\end{frame} \begin{frame} \frametitle{Plan} \begin{itemize} \item Overview \begin{itemize} \item Interacting with \R{} \item Basic concepts \item Examples \end{itemize} \item Revisit some aspects in more detail \begin{itemize} \item Important aspects of the language \item Data manipulation \item Graphics %% \item Statistical Models \end{itemize} \item Whatever else seems interesting \ldots \item Reference: Dalgaard, \emph{``Introductory Statistics with R''} \item More references: \curl{http://www.r-project.org/doc/bib/R-jabref.html} \end{itemize} \end{frame} \begin{frame} \frametitle{Interacting with \R} \R\ usually works interactively, using a question-and-answer model: \begin{itemize} \item Start \R \item Type a command and press \texttt{Enter} \item \R\ executes this command (often printing the result) \item \R\ then waits for more input \item Type \code{q()} to exit \end{itemize} \end{frame} \section{Examples} \begin{frame}[fragile] \frametitle{Simple Examples} <>= 2 + 2 exp(-2) ## exponential function log(100, base = 10) runif(10) @ \begin{itemize} \item The last command generates 10 $U(0, 1)$ random variables. \item The result (printed) is a vector of 10 numbers. \item \Rfunction{exp}, \Rfunction{log}, and \Rfunction{runif} are \emph{functions}. \item Most useful things in \R\ are done by functions. \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Simple Examples: plotting} \begin{center} <>= plot(runif(10)) @ \end{center} \end{frame} \begin{frame}[fragile] \frametitle{Variables} \begin{itemize} \item \R\ has \emph{symbolic variables} which can be assigned values. \item Assignment is done using the \code{'<-'} operator. \item The more \code{C}-like \code{'='} also works (with some exceptions). <>= x <- 2 yVar2 = x + 3 s <- "this is a character string" x yVar2 s x + x @ \end{itemize} \end{frame} \begin{frame} \frametitle{Variables} Possible variable names are very flexible. 
However, note that \begin{itemize} \item variable names cannot start with a digit \item names are case-sensitive \item some common names are already used by \R, e.g., \\ \code{c, q, t, C, D, F, I, T}, \\ and should be avoided \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Vectorized arithmetic} \begin{itemize} \item The elementary data types in \R\ are all vectors %% \item Even the scalars shown earlier are actually stored as vectors of length 1 \item The \code{c(...)} construct can be used to create vectors: <>= weight <- c(60, 72, 57, 90, 95, 72) weight @ \item To generate a vector of regularly spaced numbers, use <>= seq(0, 1, length = 11) 1:10 @ \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Vectorized arithmetic} \begin{itemize} \item Common arithmetic operations (including \code{+, -, *, /, \^}) and mathematical functions (e.g. \code{sin, cos, log}) work \emph{element-wise} on vectors, and produce another vector: <>= height <- c(1.75, 1.80, 1.65, 1.90, 1.74, 1.91) height^2 bmi <- weight / height^2 bmi log(bmi) @ \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Vectorized arithmetic} \begin{itemize} \item When two vectors are not of equal length, the shorter one is \emph{recycled}. The following adds $0$ to all the odd elements and $2$ to all the even elements of \code{1:10}: <>= 1:10 + c(0, 2) @ \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Scalars from Vectors} \begin{itemize} \item Many functions summarize a data vector by producing a scalar from a vector. 
For example <>= sum(weight) length(weight) avg.weight <- mean(weight) avg.weight @ \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Graphics} \begin{itemize} \item The simplest way to produce \R\ graphics output is to use the \Rfunction{plot} function: \begin{center} <>= plot(x = height, y = weight) @ \end{center} \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Graphics} \begin{itemize} \item Optional arguments control details of the plot \item Once created, plots can also be enhanced \begin{center} <>= plot(x = height, y = weight, pch = 2, col = "red") hh <- c(1.65, 1.70, 1.75, 1.80, 1.85, 1.90) lines(x = hh, y = 22.5 * hh^2) title(main = "weight vs height, with BMI = 22.5 curve") @ \end{center} \end{itemize} \end{frame} \section{Descriptive Statistics} \begin{frame}[fragile] \frametitle{Descriptive Statistics} Simple summary statistics: \emph{mean, median, s.d., variance} <>= x <- rnorm(100) mean(x) sd(x) var(x) median(x) @ \end{frame} \begin{frame}[fragile] \frametitle{Descriptive Statistics (contd)} Simple summary statistics: \emph{quantiles, inter-quartile range} <>= xquants <- quantile(x) xquants xquants[4] - xquants[2] IQR(x) quantile(x, probs = c(0.2, 0.4, 0.6, 0.8)) @ \end{frame} \begin{frame}[fragile] \frametitle{The \Rfunction{summary} function} \begin{itemize} \item When applied to a numeric vector, \Rfunction{summary} produces a nice summary display: <>= summary(x) @ \item The output of \Rfunction{summary} can be different when applied to other objects. \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{The Iris Dataset} \begin{itemize} \item Let's look at a real dataset: The Iris data is one of many already available in \R\ (type \code{data()} for a full list). 
\end{itemize} <<>>= head(iris) iris$Sepal.Length @ \end{frame} \begin{frame}[fragile] \frametitle{The Iris Dataset} <>= str(iris) @ \begin{itemize} \item The dataset contains measurements on 150 flowers, 50 each from 3 species: \textit{Iris setosa}, \textit{versicolor} and \textit{virginica}. \item It is typically used to illustrate the problem of \emph{classification}--- given the four measurements for a new flower, can we predict its Species? \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{The \Rfunction{summary} function revisited} <>= summary(iris) @ \begin{itemize} \item Note the different format of the output. \item \code{Species} is summarized differently because it is a \emph{categorical variable} (more commonly called \emph{factor} in \R). \end{itemize} \end{frame} \section{Graphics} \begin{frame}[fragile] \frametitle{Graphical display: Strip Plots} \begin{itemize} \item Data analysis should always start with a graphical study \item The simplest plot of numeric data is a \emph{strip plot} \begin{center} <>= stripchart(x) ## x = rnorm(100) @ \end{center} \end{itemize} \end{frame} % \begin{frame}[fragile] % \frametitle{Strip Plots for the Iris data} % We can produce a similar plot with the Iris data. Unfortunately, it's % not possible to indicate which points came from which Species. % \begin{center} % <>= % stripchart(iris[,1:4], method = "jitter", pch = 16, % cex = 0.4, offset = 0.6, las = 1) % @ % \end{center} % \end{frame} \begin{frame} \frametitle{Add-on packages} \begin{itemize} \item Built-in \R\ graphics is not very effective for multivariate data \item Packages \begin{itemize} \item \R\ allows the use of add-on packages \item Usually a collection of new \R\ functions and datasets \item Can be used to extend the functionality of \R. \item Writing new packages fairly simple. 
\end{itemize} \item General-purpose \R\ packages for visualizing multivariate data \begin{itemize} \item \Rpackage{lattice} \item \Rpackage{ggplot2} \end{itemize} \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Grouped Display} \begin{itemize} \item More informative strip plot using \Rpackage{lattice} \item Needs data in a slightly different structure \end{itemize} <>= iris2 <- reshape(iris, varying = list(names(iris)[1:4]), v.names = "measure", timevar = "type", times = names(iris)[1:4], direction = "long") str(iris2, give.attr = FALSE) @ \end{frame} \begin{frame}[fragile] \frametitle{Grouped Display} \begin{center} <>= library(package = "lattice") stripplot(type ~ measure, iris2, groups = Species, jitter = TRUE, auto.key = list(columns = 3)) @ \end{center} <>= plot(trellis.last.object()) @ \end{frame} \begin{frame}[fragile] \frametitle{Histograms} \begin{itemize} \item Strip plots not useful for large data sets. \item Most popular graphical summary for numeric data: \emph{histogram}. \begin{center} <>= hist(x) @ \end{center} \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Density Plots} \begin{itemize} \item Density plots are generalized histograms. \begin{center} <>= plot(density(x)) @ \end{center} \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Grouped Density Plots} \begin{itemize} \item Again, \Rpackage{lattice} functions are more suitable for grouped data. 
\begin{center} <<>>= densityplot(~ measure | type, data = iris2, groups = Species, scales = "free", plot.points = FALSE) @ <>= plot(trellis.last.object()) @ \end{center} \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Grouped Histogram} \begin{center} <>= histogram(~measure | type + Species, iris2, nint = 25) @ <>= plot(trellis.last.object()) @ \end{center} \end{frame} \begin{frame}[fragile] \frametitle{Grouped Scatter Plot} \begin{center} <>= xyplot(Sepal.Length ~ Petal.Length, data = iris, groups = Species, aspect = 1, auto.key = list(space = "right")) @ <>= plot(trellis.last.object()) @ \end{center} \end{frame} \begin{frame}[fragile] \frametitle{Categorical Data} We have already seen one example: <>= summary(iris$Species) @ Let us try to predict the Species using other measurements. \begin{itemize} \item What's the best measure to use? \item What are good thresholds? \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Discretizing} A continuous measure can be converted into a factor using the \Rfunction{cut} function: <>= iris$PL.disc <- cut(iris$Petal.Length, breaks = c(0, 2.5, 4.75, 7)) iris$SL.disc <- cut(iris$Sepal.Length, breaks = c(0, 5.5, 6.25, 8)) str(iris) @ \end{frame} \begin{frame}[fragile] \frametitle{Tables} Association between categorical variables summarized by \emph{tables}. 
<>= PL.tab <- xtabs(~ PL.disc + Species, iris) SL.tab <- with(iris, table(SL.disc, Species)) PL.tab SL.tab @ \end{frame} \begin{frame}[fragile] \frametitle{Graphical Display of Tables: Bar chart} \begin{center} <>= par(mfrow = c(1,2)) barplot(PL.tab, beside = TRUE, main = "Petal Length") barplot(SL.tab, beside = TRUE, main = "Sepal Length") @ \end{center} \end{frame} \begin{frame}[fragile] \frametitle{Graphical Display of Tables: Bar chart} \begin{center} <<>>= barchart(t(SL.tab), horizontal = FALSE, stack = FALSE, auto.key = list(columns = 3), main = "Sepal Length") @ <>= plot(trellis.last.object()) @ \end{center} \end{frame} \begin{frame}[fragile] \frametitle{Higher Dimensional Tables} The built-in \code{Titanic} data set is a cross-tabulation of 4 characteristics of 2201 passengers on the Titanic <>= dimnames(Titanic) @ \end{frame} \begin{frame}[fragile] \frametitle{Titanic Survivors} \begin{center} <>= mosaicplot(Titanic, color = TRUE) @ \end{center} \end{frame} \begin{frame}[fragile] \frametitle{Titanic Survivors (simplified)} \begin{center} <>= mosaicplot(apply(Titanic, c(1, 2, 4), sum), color = TRUE) @ \end{center} \end{frame} %% The R help system and how to use it \section{Help} \begin{frame}[fragile] \frametitle{Getting help} \R\ has too many tools for anyone to remember them all, so it is very important to know how to find relevant information using the help system. \begin{itemize} \item \code{help.start()} \\ Starts a browser window with an HTML help interface. One of the best ways to get started. Has links to a very detailed manual for beginners called `An Introduction to \R{}', as well as topic-wise listings. \item \code{help(topic)} \\ Displays the help page for a particular topic or function. Every \R\ function has a help page. 
\item \code{help.search("search string")} \\ Subject/keyword search \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Getting help (contd)} \begin{itemize} \item The \Rfunction{help} function provides topic-wise help. When you know which function you are interested in, this is usually the best way to learn how to use it. There's also a shortcut for this; use a question mark (\code{?}) followed by the topic. The following are equivalent: <>= help(plot) ?plot @ \item When you want to know about a specific subject, but don't know which particular help page has the information, the \Rfunction{help.search} function (shortcut: \code{??}) is very useful. For example, try <>= help.search("logarithm") ??logarithm @ \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Getting help (contd)} The help pages can be opened in a browser as well: <>= help(plot, help_type = "html") @ The help pages are usually detailed (but terse). Among other things, they often contain \begin{itemize} \item A `See Also' section that lists related help pages \item A Description of what the function returns \item An `Examples' section, with actual code illustrating how to use the documented functions. These examples can actually be run directly using the \code{example} function; e.g., try <>= example(plot) @ \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{\Rfunction{apropos}} Another useful tool is the \Rfunction{apropos} function: <>= apropos("plot") @ \end{frame} %% R packages \section{Packages} \begin{frame} \frametitle{\R\ packages} \R\ makes use of a system of \emph{packages} \begin{itemize} \item Each package is a collection of routines with a common theme \item The core of \R\ itself is a package called \Rpackage{base} \item A collection of packages is called a \emph{library} \item Some packages are already loaded when \R\ starts up. 
Other packages need to be loaded using the \Rfunction{library} function \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{\R\ packages} Several packages come pre-installed with \R{}. <>= ip <- installed.packages() rownames(ip)[ip[, "Priority"] %in% c("base", "recommended")] @ \end{frame} \begin{frame}[fragile] \frametitle{\R\ packages} \begin{itemize} \item There are also many (more than 300) other packages contributed by various users of \R\ available online, from the Comprehensive \R\ Archive Network (\emph{CRAN}): \\ \curl{http://cran.fhcrc.org/web/packages/} \item The \emph{Bioconductor} project provides an extensive collection of \R\ packages specifically for bioinformatics \\ \curl{http://www.bioconductor.org/packages/release/Software.html} \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{\R\ packages} \begin{itemize} \item It is fairly easy for anyone to write new \R\ packages. This is one of the attractions of \R\ over other statistical software. \item Some packages are already loaded when \R\ starts up. At any point, the list of currently loaded packages can be obtained using the \Rfunction{search} function: <>= search() @ \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{\R\ packages} \begin{itemize} \item Other packages can be loaded by the user. \item For example, the \Rpackage{ISwR} package contains datasets used in Dalgaard, ``Introductory Statistics with R''. \item This can be loaded by: <>= library(ISwR) @ \item New packages can be downloaded and installed using the \Rfunction{install.packages} function. \item For example, to install the \Rpackage{ISwR} package (if it's not already installed), one can use <>= install.packages("ISwR") library(help = ISwR) @ \item The last call gives a list of all help pages in the package. \end{itemize} \end{frame} \end{document}