# The basics -------------------------------------------------------------- # Calculate 1 + 1 1 + 1 # Now you try: calculate the sum of 562 and 9378 562 + 9378 # Addition 5 + 5 # Subtraction 5 - 5 # Multiplication 2 * 5 # Division 35 / 5 # Exponential 5 ^ 2 # Modulo - this is sometimes helpful for printing output every N iterations 5 %% 3 # Now you try: there are ~3000 cM in the genome and 22 chromosomes. # Calculate the average chromosome size 3000 / 22 # Assigning variables ----------------------------------------------------- # Assign the value of 1 to the variable called a a <- 1 a = 1 # Notice how the environment now has a. # You can print small variables to the console as follows: a # Try to use informative variable names. # Now you try: Assign 22 to a variable named autosomes autosomes <- 22 # Now you try: Print out the variable you just created autosomes # Now you try: Add the two variables that you've assigned together (a and autosomes) autosomes + a # Now we'll store the sum of the two variables into a new variable called chromosomes chromosomes <- a + autosomes # Data types -------------------------------------------------------------- # There are several different "types" in R b <- 'chromosomes' # Print out the variable b, notice that it's surrounded by quotes b # Notice what happens if you try to add incompatible types a + b # Most common include integer, character, double, logical, and list my_double <- 12.5 # Logicals (aka booleans) can be TRUE or FALSE. Can be abbreviated to T or F my_logical <- TRUE my_logical2 <- T my_logical3 <- F # TRUE can act as 1 and FALSE can act as 0 a + my_logical # Check types of variables, using the function typeof typeof(a) typeof(b) # Now you try: what is the type of my_double and my_logical typeof(my_logical) # Learn more about the function typeof with ?typeof # R help has similar structure for most functions: # Description, Usage, Arguments, Value, See Also, Examples ?typeof # Let's get an example of the function typeof: example(typeof) # Variables in R are case sensitive B <- 'CHROMOSOMES' # Check whether variables b and B are the same b == B # There are several comparison tools: <, >, <=, >=, ==, and != # Characters are ordered lexicographically B > b # Vector sandbox -------------------------------------------- # The function c let's you combine elements together number_vector <- c(1, 2, 3) character_vector <- c('a', 'b', 'c') # Now you try: make variable logical_vector, with values = TRUE, FALSE, and TRUE (in that order) logical_vector <- c(TRUE, FALSE, TRUE) logical_vector2 <- c(T, F, T) sum(logical_vector) # Check comprehension: what's going on here? hodgepodge_vector <- c(number_vector, character_vector) # Bonus: what's going on here? c(a, b, c) # We're going to use some often-used functions to understand how to do more useful data analysis # See reference card: https://cran.r-project.org/doc/contrib/Short-refcard.pdf # Fill a chromosome vector of lenth 22 with values from a random normal distribution # Mean = 0, var = 1 chromosome_vector <- rnorm(22) # Print the vector and assume values correspond to each chromosome chromosome_vector # Name the vector to keep track of which value corresponds to which names(chromosome_vector) <- paste0('chr', 1:22) # Now let's take another look at what our named vector looks like chromosome_vector # Make another random chromosome vector chromosome_vector2 <- rnorm(22) # Name new chromosome vector with same names as old vector names(chromosome_vector2) <- names(chromosome_vector) # Now you try: how do you check if the values of each chromosome vector are the same? chromosome_vector == chromosome_vector2 # Let's calculate the sum over all chromosomes sum(chromosome_vector) sum(chromosome_vector2) my_vector <- rnorm(100000) # Is the first vector's sum larger than the second? sum(chromosome_vector) > sum(chromosome_vector2) # What's the value on chromosome 6 chromosome_vector[6] # Alternatively chromosome_vector['chr6'] # What are the values are for the last chromosome chromosome_vector[length(chromosome_vector)] chromosome_vector['chr22'] chromosome_vector[22] # What the values are for the first 4 chromosomes chromosome_vector[1:4] # Now you try: what are the values for chromosomes 1, 5, and 9 (in one command)? chromosome_vector[c(1,5,9)] sum(chromosome_vector[c(1,5,9)]) chromosome_vector[sum(c(1,5,9))] # Get some statistics about the chromosome vector mean(chromosome_vector) min(chromosome_vector) max(chromosome_vector) # A useful function to look at these all at once summary(chromosome_vector) summary(my_logical) summary(logical_vector) # Let's quickly lok at the chromosome vector without printing the whole thing str(chromosome_vector) # Create a scatter plot with R plot(chromosome_vector) # Matrices ---------------------------------------------------------------- # A matrix has all the same type (e.g. numeric, character, logical) # This matrix is filled rowwise and has two columns number_matrix <- matrix(1:26, byrow=T, ncol=2) # Now you try, make a matrix called letter_matrix with 2 columns of the full alphabet, filled columnwise. # Hint: try using the letters function # Now let's say we have the following vectors vecA <- 1:5 vecB <- rnorm(5) vecC <- letters[1:5] # Try putting these vectors together my_matrix <- matrix(c(vecA, vecB, vecC), ncol=3) # Note that a number can be a character, but a character isn't a number as.numeric('a') # Characters don't give us very useful data summaries summary(my_matrix) # We can access just the numeric parts though # First number is rows, 2nd is columns, separated by a comma. # If a number is left out, you're asking for all values my_matrix_topcols <- matrix(as.numeric(my_matrix2[,1:2]), ncol=2) my_matrix_toprows <- my_matrix2[1:2,] # Check comprehension: why was it so complicated to make my_matrix_topcols? # Give the matrix some more interesting column names colnames(my_matrix_topcols) <- c('integers', 'random') # Now you try: what are the min, max, and mean of both columns of my_matrix_topcols (in one command)? # Get the column sums and column means of my_matrix_topcols colSums(my_matrix_topcols) colMeans(my_matrix_topcols) # You can also modify every element of a matrix 5 * my_matrix_topcols # Add another numeric column with 5 random values from 1 to 100 to my_matrix_topcols cbind(my_matrix_topcols, round(runif(5, min=1, max=100))) # Factors ----------------------------------------------------------------- # Assign a vector of study genders gender <- c('Male', 'Female', 'Male', 'Male', 'Female', 'Female') # Summarize the gender vector summary(gender) # Hmm... that's not very helpful. Let's try converting it to a factor gender_factor <- factor(gender) summary(gender_factor) # Create a factor vector of temperatures temperature <- factor(c("High", "Low", "High","Low", "Medium")) temperature # Hmm... these aren't in logical order temperature_order <- factor(temperature, levels=c('Low', 'Medium', 'High')) # We might have data coded slightly differently than we want survey_vector <- factor(c("M", "F", "F", "M", "M")) # Change the name of the factor levels levels(survey_vector) <- c('Female', 'Male') # Check comprehension: why did I list female first when relabeling the factor levels? # The levels of the factor are important, because they can indicate the order of data speed <- factor(c("fast", "slow", "slow", "fast", "insane"), ordered=T, levels=c('slow', 'fast', 'insane')) summary(speed) # Data frames ------------------------------------------------------------- # This is the data type you'll use most often. R has some example data that we'll consider mtcars # Just look at the top of mtcars head(mtcars) # Get a brief description of mtcars str(mtcars) plot(mtcars[,1], mtcars[,2]) plot(mtcars[,'mpg'], mtcars[,'cyl']) plot(mtcars$mpg, mtcars$cyl) # Let's define some planet-related vectors name <- c("Mercury", "Venus", "Earth", "Mars", "Jupiter", "Saturn", "Uranus", "Neptune") type <- c("Terrestrial planet", "Terrestrial planet", "Terrestrial planet", "Terrestrial planet", "Gas giant", "Gas giant", "Gas giant", "Gas giant") diameter <- c(0.382, 0.949, 1, 0.532, 11.209, 9.449, 4.007, 3.883) rotation <- c(58.64, -243.02, 1, 1.03, 0.41, 0.43, -0.72, 0.67) rings <- c(FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE) # Create a data frame from the vectors planets_df <- data.frame(name=name, type=type, diameter=diameter, rotation=rotation, rings=rings) planets_df <- data.frame(name=c("Mercury", "Venus", "Earth", "Mars", "Jupiter", "Saturn", "Uranus", "Neptune"), type=type, diameter=diameter, rotation=rotation, rings=rings) data.frame(name, type, diameter, rotation, rings) # Check the structure of this data frame str(planets_df) # Now you try: print out diameter of Mercury (row 1, column 3) planets_df[1,3] planets_df[1,'diameter'] # Now you try: print out data for Mars (entire fourth row) planets_df[4,] # Make of table of planet rings by type table(planets_df$type, planets_df$rings) # Select all planets with rings planets_df[rings,] planets_df[!rings,] # Now you try: what is the mean diameter of planets with rings mean(planets_df[rings,'diameter']) my_type <- planets_df$type == 'Gas giant' # Subset is a useful command for filtering data frames subset(planets_df, diameter < 1) # Order can give you a vector of relative value orders order(planets_df[,'diameter']) # You can access columns of data frames with a $ symbol planets_df$diameter # Reorder data frame by value of diameter planets_df_order <- planets_df[order(planets_df$diameter),] # Some data of our own ---------------------------------------------------- # Download this file from the 1000 Genomes Project: https://personal.broadinstitute.org/armartin/tgp_admixture/tutorial/integrated_call_samples_v3.20130502.ALL.panel # Set a directory on your own computer where you downloaded this data # setwd('/Users/alicia/daly_lab/Travel/GINGER/teaching/R_tutorial') setwd('/Users/alicia/Desktop/') # load this as a data frame tgp_data <- read.table('integrated_call_samples_v3.20130502.ALL.panel.txt', header=T) # look at the data head(tgp_data) # get a data summary summary(tgp_data) str(tgp_data) # Now you try: how many males and females are there? # Now you try: what are the different super populations, and how many are there? # Load PCA data for these individuals tgp_pca <- read.table('ALL.phase3_shapeit2_snps_integrated_v5.20130502.genotypes.maf05.geno1.ld50.out.evec.txt') # Install a library called dplyr, which helps a ton with data processing/manipulation install.packages(c('dplyr', 'tidyr', 'ggplot2')) # Now load dplyr so we can use it. Note: we usually load libraries at the top of scripts # There is a ton of functionality in this package. # See cheat sheet here: https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf library(plyr) library(dplyr) library(tidyr) library(ggplot2) head(tgp_pca) # Look at the packages tab of your bottom right window # The %>% function acts as a "pipe" (equivalent to | in bash). # This means, take the data I passed on, do something to it, and continue pass it on to another function # It takes some time to wrap your head around this, but is really powerful for data manipulation once you get the hang of it # Note that if you're piping sequences of commands together, the first argument is assumed from the pipe tgp_pca %>% head() # Separate the first column tgp_pca %>% separate(V1, c('Pop', 'Ind')) %>% head() new_var <- tgp_pca %>% separate(V1, c('Pop', 'Ind')) ggplot(new_var, aes(x=V2, y=V3, color=Pop)) + geom_point() my_merge <- merge(tgp_data, new_var, by.x='sample', by.y='Ind') ggplot(my_merge, aes(x=V2, y=V3, color=super_pop)) + geom_point() + xlab('PC1') + ylab('PC2') + ggtitle('my pca plot') ggplot(my_merge, aes(x=V4, y=V5, color=super_pop)) + geom_point() # Rename columns tgp_pca %>% rename(Ind=V1, PC1=V2, PC2=V3, PC3=V4, PC4=V5, PC5=V6, Pheno=V7) %>% head() # Separate first column, rename columns, and store this back into the same variable name tgp_pca <- tgp_pca %>% separate(V1, c('Pop', 'Ind')) %>% rename(PC1=V2, PC2=V3, PC3=V4, PC4=V5, PC5=V6, Pheno=V7) # Check comprehension: why didn't we set V1 in the rename? # Now let's merge the PCA data and the overall data info tgp_merge <- merge(tgp_data, tgp_pca, by.x='sample', by.y='Ind') # Now we have two columns with the same values (pop and Pop). Double check to be sure sum(tgp_merge$pop!=tgp_merge$Pop) # You can merge on multiple columns (this gives only one pop column) tgp_merge <- merge(tgp_data, tgp_pca, by.x=c('sample', 'pop'), by.y=c('Ind', 'Pop')) # Let's try plotting some of the PCs! plot(tgp_merge$PC1, tgp_merge$PC2) # Hmm, maybe color by population will help? plot(tgp_merge$PC1, tgp_merge$PC2, col=tgp_merge$super_pop) # This plot would look nicer and be interpreted with a little work in ggplot # First we have to tell ggplot where the data is coming from # Then we have to tell it which aesthetics (aes) we want to use # Then we tell it what kind of plot to make pca_plot <- ggplot(tgp_merge, aes(x=PC1, y=PC2, color=super_pop)) + geom_point() # The grey background is kind of ugly. Note that you can keep stringing together plotting commands with + pca_plot + theme_bw() # Try out different coloring schemes # RColorBrewer has some nice ones that you can explore here: http://colorbrewer2.org/ pca_plot + theme_bw() + scale_color_brewer(palette='Set1') # Now you try: make a PCA plot of PCs 3 and 4, coloring by population # You can make many different kinds of plots very easily in ggplot. # Try exploring with: https://www.rstudio.com/wp-content/uploads/2015/03/ggplot2-cheatsheet.pdf