# The basics --------------------------------------------------------------

# Calculate 1 + 1
1 + 1

# Now you try: calculate the sum of 562 and 9378
562 + 9378

# Addition
5 + 5

# Subtraction
5 - 5

# Multiplication
2 * 5

# Division
35 / 5

# Exponential
5 ^ 2

# Modulo - this is sometimes helpful for printing output every N iterations
5 %% 3

# Now you try: there are ~3000 cM in the genome and 22 chromosomes.
# Calculate the average chromosome size
3000 / 22

# Assigning variables -----------------------------------------------------

# Assign the value of 1 to the variable called a
a <- 1
a = 1

# Notice how the environment now has a. 
# You can print small variables to the console as follows:
a

# Try to use informative variable names. 
# Now you try: Assign 22 to a variable named autosomes
autosomes <- 22

# Now you try: Print out the variable you just created
autosomes

# Now you try: Add the two variables that you've assigned together (a and autosomes)
autosomes + a

# Now we'll store the sum of the two variables into a new variable called chromosomes
chromosomes <- a + autosomes


# Data types --------------------------------------------------------------

# There are several different "types" in R
b <- 'chromosomes'

# Print out the variable b, notice that it's surrounded by quotes
b

# Notice what happens if you try to add incompatible types
a + b

# Most common include integer, character, double, logical, and list
my_double <- 12.5

# Logicals (aka booleans) can be TRUE or FALSE. Can be abbreviated to T or F
my_logical <- TRUE
my_logical2 <- T
my_logical3 <- F

# TRUE can act as 1 and FALSE can act as 0
a + my_logical

# Check types of variables, using the function typeof
typeof(a)

typeof(b)

# Now you try: what is the type of my_double and my_logical
typeof(my_logical)

# Learn more about the function typeof with ?typeof
# R help has similar structure for most functions: 
# Description, Usage, Arguments, Value, See Also, Examples
?typeof

# Let's get an example of the function typeof:
example(typeof)

# Variables in R are case sensitive
B <- 'CHROMOSOMES'

# Check whether variables b and B are the same
b == B

# There are several comparison tools: <, >, <=, >=, ==, and !=
# Characters are ordered lexicographically 
B > b


# Vector sandbox --------------------------------------------

# The function c let's you combine elements together
number_vector <- c(1, 2, 3)
character_vector <- c('a', 'b', 'c')

# Now you try: make variable logical_vector, with values = TRUE, FALSE, and TRUE (in that order)
logical_vector <- c(TRUE, FALSE, TRUE)
logical_vector2 <- c(T, F, T)
sum(logical_vector)

# Check comprehension: what's going on here?
hodgepodge_vector <- c(number_vector, character_vector)

# Bonus: what's going on here?
c(a, b, c)

# We're going to use some often-used functions to understand how to do more useful data analysis
# See reference card: https://cran.r-project.org/doc/contrib/Short-refcard.pdf
# Fill a chromosome vector of lenth 22 with values from a random normal distribution
# Mean = 0, var = 1
chromosome_vector <- rnorm(22)

# Print the vector and assume values correspond to each chromosome
chromosome_vector

# Name the vector to keep track of which value corresponds to which
names(chromosome_vector) <- paste0('chr', 1:22)

# Now let's take another look at what our named vector looks like
chromosome_vector

# Make another random chromosome vector
chromosome_vector2 <- rnorm(22)

# Name new chromosome vector with same names as old vector
names(chromosome_vector2) <- names(chromosome_vector)

# Now you try: how do you check if the values of each chromosome vector are the same?
chromosome_vector == chromosome_vector2

# Let's calculate the sum over all chromosomes
sum(chromosome_vector)
sum(chromosome_vector2)

my_vector <- rnorm(100000)

# Is the first vector's sum larger than the second?
sum(chromosome_vector) > sum(chromosome_vector2)

# What's the value on chromosome 6
chromosome_vector[6]
# Alternatively
chromosome_vector['chr6']

# What are the values are for the last chromosome
chromosome_vector[length(chromosome_vector)]
chromosome_vector['chr22']
chromosome_vector[22]

# What the values are for the first 4 chromosomes
chromosome_vector[1:4]

# Now you try: what are the values for chromosomes 1, 5, and 9 (in one command)?
chromosome_vector[c(1,5,9)]
sum(chromosome_vector[c(1,5,9)])
chromosome_vector[sum(c(1,5,9))]

# Get some statistics about the chromosome vector
mean(chromosome_vector)
min(chromosome_vector)
max(chromosome_vector)

# A useful function to look at these all at once
summary(chromosome_vector)
summary(my_logical)
summary(logical_vector)

# Let's quickly lok at the chromosome vector without printing the whole thing
str(chromosome_vector)

# Create a scatter plot with R
plot(chromosome_vector)


# Matrices ----------------------------------------------------------------

# A matrix has all the same type (e.g. numeric, character, logical)
# This matrix is filled rowwise and has two columns
number_matrix <- matrix(1:26, byrow=T, ncol=2)

# Now you try, make a matrix called letter_matrix with 2 columns of the full alphabet, filled columnwise. 
# Hint: try using the letters function


# Now let's say we have the following vectors
vecA <- 1:5
vecB <- rnorm(5)
vecC <- letters[1:5]

# Try putting these vectors together
my_matrix <- matrix(c(vecA, vecB, vecC), ncol=3)

# Note that a number can be a character, but a character isn't a number
as.numeric('a')

# Characters don't give us very useful data summaries
summary(my_matrix)

# We can access just the numeric parts though
# First number is rows, 2nd is columns, separated by a comma. 
# If a number is left out, you're asking for all values
my_matrix_topcols <- matrix(as.numeric(my_matrix2[,1:2]), ncol=2)
my_matrix_toprows <- my_matrix2[1:2,]

# Check comprehension: why was it so complicated to make my_matrix_topcols?

# Give the matrix some more interesting column names
colnames(my_matrix_topcols) <- c('integers', 'random')

# Now you try: what are the min, max, and mean of both columns of my_matrix_topcols (in one command)?


# Get the column sums and column means of my_matrix_topcols
colSums(my_matrix_topcols)
colMeans(my_matrix_topcols)

# You can also modify every element of a matrix
5 * my_matrix_topcols

# Add another numeric column with 5 random values from 1 to 100 to my_matrix_topcols
cbind(my_matrix_topcols, round(runif(5, min=1, max=100)))


# Factors -----------------------------------------------------------------

# Assign a vector of study genders
gender <- c('Male', 'Female', 'Male', 'Male', 'Female', 'Female')

# Summarize the gender vector
summary(gender)

# Hmm... that's not very helpful. Let's try converting it to a factor
gender_factor <- factor(gender)
summary(gender_factor)

# Create a factor vector of temperatures
temperature <- factor(c("High", "Low", "High","Low", "Medium"))
temperature

# Hmm... these aren't in logical order
temperature_order <- factor(temperature, levels=c('Low', 'Medium', 'High'))

# We might have data coded slightly differently than we want
survey_vector <- factor(c("M", "F", "F", "M", "M"))

# Change the name of the factor levels
levels(survey_vector) <- c('Female', 'Male')

# Check comprehension: why did I list female first when relabeling the factor levels?


# The levels of the factor are important, because they can indicate the order of data
speed <- factor(c("fast", "slow", "slow", "fast", "insane"), ordered=T, levels=c('slow', 'fast', 'insane'))

summary(speed)


# Data frames -------------------------------------------------------------

# This is the data type you'll use most often. R has some example data that we'll consider
mtcars

# Just look at the top of mtcars
head(mtcars)

# Get a brief description of mtcars
str(mtcars)

plot(mtcars[,1], mtcars[,2])
plot(mtcars[,'mpg'], mtcars[,'cyl'])
plot(mtcars$mpg, mtcars$cyl)

# Let's define some planet-related vectors
name <- c("Mercury", "Venus", "Earth", "Mars", "Jupiter", "Saturn", "Uranus", "Neptune")
type <- c("Terrestrial planet", "Terrestrial planet", "Terrestrial planet", 
          "Terrestrial planet", "Gas giant", "Gas giant", "Gas giant", "Gas giant")
diameter <- c(0.382, 0.949, 1, 0.532, 11.209, 9.449, 4.007, 3.883)
rotation <- c(58.64, -243.02, 1, 1.03, 0.41, 0.43, -0.72, 0.67)
rings <- c(FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE)

# Create a data frame from the vectors
planets_df <- data.frame(name=name, type=type, diameter=diameter, rotation=rotation, rings=rings)

planets_df <- data.frame(name=c("Mercury", "Venus", "Earth", "Mars", "Jupiter", "Saturn", "Uranus", "Neptune"), 
                         type=type, diameter=diameter, rotation=rotation, rings=rings)

data.frame(name, type, diameter, rotation, rings)


# Check the structure of this data frame
str(planets_df)

# Now you try: print out diameter of Mercury (row 1, column 3)
planets_df[1,3]
planets_df[1,'diameter']

# Now you try: print out data for Mars (entire fourth row)
planets_df[4,]

# Make of table of planet rings by type
table(planets_df$type, planets_df$rings)

# Select all planets with rings
planets_df[rings,]
planets_df[!rings,]

# Now you try: what is the mean diameter of planets with rings
mean(planets_df[rings,'diameter'])
my_type <- planets_df$type == 'Gas giant'


# Subset is a useful command for filtering data frames
subset(planets_df, diameter < 1)

# Order can give you a vector of relative value orders
order(planets_df[,'diameter'])

# You can access columns of data frames with a $ symbol
planets_df$diameter

# Reorder data frame by value of diameter
planets_df_order <- planets_df[order(planets_df$diameter),]


# Some data of our own ----------------------------------------------------

# Download this file from the 1000 Genomes Project: https://personal.broadinstitute.org/armartin/tgp_admixture/tutorial/integrated_call_samples_v3.20130502.ALL.panel

# Set a directory on your own computer where you downloaded this data

# setwd('/Users/alicia/daly_lab/Travel/GINGER/teaching/R_tutorial')
setwd('/Users/alicia/Desktop/')

# load this as a data frame

tgp_data <- read.table('integrated_call_samples_v3.20130502.ALL.panel.txt', header=T)

# look at the data 
head(tgp_data)

# get a data summary
summary(tgp_data)
str(tgp_data)

# Now you try: how many males and females are there?


# Now you try: what are the different super populations, and how many are there?


# Load PCA data for these individuals
tgp_pca <- read.table('ALL.phase3_shapeit2_snps_integrated_v5.20130502.genotypes.maf05.geno1.ld50.out.evec.txt')

# Install a library called dplyr, which helps a ton with data processing/manipulation
install.packages(c('dplyr', 'tidyr', 'ggplot2'))

# Now load dplyr so we can use it. Note: we usually load libraries at the top of scripts
# There is a ton of functionality in this package.
# See cheat sheet here: https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
library(plyr)
library(dplyr)
library(tidyr)
library(ggplot2)

head(tgp_pca)

# Look at the packages tab of your bottom right window
# The %>% function acts as a "pipe" (equivalent to | in bash). 
# This means, take the data I passed on, do something to it, and continue pass it on to another function
# It takes some time to wrap your head around this, but is really powerful for data manipulation once you get the hang of it
# Note that if you're piping sequences of commands together, the first argument is assumed from the pipe
tgp_pca %>%
  head()

# Separate the first column
tgp_pca %>%
  separate(V1, c('Pop', 'Ind')) %>%
  head()

new_var <- tgp_pca %>%
  separate(V1, c('Pop', 'Ind'))

ggplot(new_var, aes(x=V2, y=V3, color=Pop)) +
  geom_point()

my_merge <- merge(tgp_data, new_var, by.x='sample', by.y='Ind')

ggplot(my_merge, aes(x=V2, y=V3, color=super_pop)) +
  geom_point() +
  xlab('PC1') +
  ylab('PC2') +
  ggtitle('my pca plot')

ggplot(my_merge, aes(x=V4, y=V5, color=super_pop)) +
  geom_point()
  

# Rename columns
tgp_pca %>%
  rename(Ind=V1, PC1=V2, PC2=V3, PC3=V4, PC4=V5, PC5=V6, Pheno=V7) %>%
  head()

# Separate first column, rename columns, and store this back into the same variable name
tgp_pca <- tgp_pca %>%
  separate(V1, c('Pop', 'Ind')) %>%
  rename(PC1=V2, PC2=V3, PC3=V4, PC4=V5, PC5=V6, Pheno=V7)

# Check comprehension: why didn't we set V1 in the rename?


# Now let's merge the PCA data and the overall data info
tgp_merge <- merge(tgp_data, tgp_pca, by.x='sample', by.y='Ind')

# Now we have two columns with the same values (pop and Pop). Double check to be sure
sum(tgp_merge$pop!=tgp_merge$Pop)

# You can merge on multiple columns (this gives only one pop column)
tgp_merge <- merge(tgp_data, tgp_pca, by.x=c('sample', 'pop'), by.y=c('Ind', 'Pop'))
  
# Let's try plotting some of the PCs!
plot(tgp_merge$PC1, tgp_merge$PC2)

# Hmm, maybe color by population will help?
plot(tgp_merge$PC1, tgp_merge$PC2, col=tgp_merge$super_pop)

# This plot would look nicer and be interpreted with a little work in ggplot
# First we have to tell ggplot where the data is coming from
# Then we have to tell it which aesthetics (aes) we want to use
# Then we tell it what kind of plot to make
pca_plot <- ggplot(tgp_merge, aes(x=PC1, y=PC2, color=super_pop)) +
  geom_point()

# The grey background is kind of ugly. Note that you can keep stringing together plotting commands with +
pca_plot + theme_bw()

# Try out different coloring schemes
# RColorBrewer has some nice ones that you can explore here: http://colorbrewer2.org/
pca_plot + theme_bw() + scale_color_brewer(palette='Set1')

# Now you try: make a PCA plot of PCs 3 and 4, coloring by population


# You can make many different kinds of plots very easily in ggplot.
# Try exploring with: https://www.rstudio.com/wp-content/uploads/2015/03/ggplot2-cheatsheet.pdf