Data Manipulation

Rodrigo Theodoro Rocha

11 de Outubro 2018

Introduction

Data transformation

Clearing concepts (Names)

Day Sample 1 Sample 2 Sample 3
1 100 120 89
2 50 30 55
3 25 10
day <- c(1, 5, 10)
sample1 <- c(100, 50, 25)
sample2 <- c(120, 30, 10)
sample3 <- c(89, 55, NA)

dfmessyWrongName <- data.frame("Day"=day, "Sample 1"=sample1, "Sample 2"=sample2, "Sample 3"=sample3)
Day Sample.1 Sample.2 Sample.3
1 100 120 89
5 50 30 55
10 25 10 NA

Clearing concepts (Names)

Day Sample 1 Sample 2 Sample 3
1 100 120 89
2 50 30 55
3 25 10
dfmessy <- data.frame("Day"=day, "Sample_1"=sample1, "Sample_2"=sample2, "Sample_3"=sample3)
Names with non-space characters.
Day Sample_1 Sample_2 Sample_3
1 100 120 89
5 50 30 55
10 25 10 NA

Question

Could you make a plot with Day in x-axis and all Samples measurements in y-axis?

Solution?

plot(dfmessy$Day, dfmessy$Sample_1, xlab='Day', ylab='Measurement (mg)', col='red',pch=19)
points(dfmessy$Day, dfmessy$Sample_2, col='black',pch=19)
points(dfmessy$Day, dfmessy$Sample_3, col='green',pch=19)

Clearing concepts (Messy vs Tidy)

Clearing concepts (ex Tidy)

How to Tidy Data

Day Sample_1 Sample_2 Sample_3
1 100 120 89
5 50 30 55
10 25 10 NA

Gathering

## Warning: package 'nycflights13' was built under R version 3.4.4
library(tidyr)
gather(dfmessy, Sample_1, Sample_2, Sample_3, key = 'Samples', value = 'Measurements')
##   Day  Samples Measurements
## 1   1 Sample_1          100
## 2   5 Sample_1           50
## 3  10 Sample_1           25
## 4   1 Sample_2          120
## 5   5 Sample_2           30
## 6  10 Sample_2           10
## 7   1 Sample_3           89
## 8   5 Sample_3           55
## 9  10 Sample_3           NA

Better Solution

plot(dftidy$Day, dftidy$measurement, xlab='Day', ylab='Measurement (mg)',col=as.factor(dftidy$samples), pch=19)

When things become interesting

library(readr)
original_data <- read_delim("http://varianceexplained.org/files/Brauer2008_DataSet1.tds", delim = "\t")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   GID = col_character(),
##   YORF = col_character(),
##   NAME = col_character(),
##   GWEIGHT = col_integer()
## )
## See spec(...) for full column specifications.
View(head(original_data))

Exploring Expression Dataset

Multiple Variables stored in one Column

original_data$NAME[1:3]
## [1] "SFB2       || ER to Golgi transport || molecular function unknown || YNL049C || 1082129"          
## [2] "          || biological process unknown || molecular function unknown || YNL095C || 1086222"      
## [3] "QRI7       || proteolysis and peptidolysis || metalloendopeptidase activity || YDL104C || 1085955"

Separate

      separate(<name of the data frame>, <name of the variable (without quotes) to separate>, < vector with new variable names>, sep= separator)
cleaned_data <- separate(original_data, NAME, c("name", "BP", "MF", "systematic_name", "number"), sep="\\|\\|")
DATAFRAME %>% separate( <name of the variable (without quotes) to separate>, < vector with new variable names>, sep= separator)
cleaned_data <- original_data %>% separate(NAME, c("name", "BP", "MF", "systematic_name", "number"), sep="\\|\\|")

The pipe: %>%

Selecting columns with select()

      select(<name of the data frame>, <name of columns>)
# Select columns by name: name, BP, MF, systematic_name

select(cleaned_data, name, BP, MF, systematic_name)

# Select all columns between name and systematic_name (inclusive)

select(cleaned_data, name:systematic_name)

# Select all columns except number and GWEIGHT

select(cleaned_data, -number, -GWEIGHT)

Exercise

  1. Create a new dataframe from cleaned_data with all variables except number, GID, YORF and GWEIGHT. Save it in a new object called cleaned_data_new.

Solution

(cleaned_data_new <- cleaned_data %>% select(-number, -GID, -YORF, -GWEIGHT))
## # A tibble: 5,537 x 40
##    name  BP    MF    systematic_name G0.05  G0.1 G0.15  G0.2 G0.25  G0.3
##    <chr> <chr> <chr> <chr>           <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 "SFB… " ER… " mo… " YNL049C "     -0.24 -0.13 -0.21 -0.15 -0.05 -0.05
##  2 "   … " bi… " mo… " YNL095C "      0.28  0.13 -0.4  -0.48 -0.11  0.17
##  3 "QRI… " pr… " me… " YDL104C "     -0.02 -0.27 -0.27 -0.02  0.24  0.25
##  4 "CFT… " mR… " RN… " YLR115W "     -0.33 -0.41 -0.24 -0.03 -0.03  0   
##  5 "SSO… " ve… " t-… " YMR183C "      0.05  0.02  0.4   0.34 -0.13 -0.14
##  6 "PSP… " bi… " mo… " YML017W "     -0.69 -0.03  0.23  0.2   0    -0.27
##  7 "RIB… " ri… " ps… " YOL066C "     -0.55 -0.3  -0.12 -0.03 -0.16 -0.11
##  8 "VMA… " va… " hy… " YPR036W "     -0.75 -0.12 -0.07  0.02 -0.32 -0.41
##  9 "EDC… " de… " mo… " YEL015W "     -0.24 -0.22  0.14  0.06  0    -0.13
## 10 "VPS… " pr… " pr… " YOR069W "     -0.16 -0.38  0.05  0.14 -0.04 -0.01
## # ... with 5,527 more rows, and 30 more variables: N0.05 <dbl>,
## #   N0.1 <dbl>, N0.15 <dbl>, N0.2 <dbl>, N0.25 <dbl>, N0.3 <dbl>,
## #   P0.05 <dbl>, P0.1 <dbl>, P0.15 <dbl>, P0.2 <dbl>, P0.25 <dbl>,
## #   P0.3 <dbl>, S0.05 <dbl>, S0.1 <dbl>, S0.15 <dbl>, S0.2 <dbl>,
## #   S0.25 <dbl>, S0.3 <dbl>, L0.05 <dbl>, L0.1 <dbl>, L0.15 <dbl>,
## #   L0.2 <dbl>, L0.25 <dbl>, L0.3 <dbl>, U0.05 <dbl>, U0.1 <dbl>,
## #   U0.15 <dbl>, U0.2 <dbl>, U0.25 <dbl>, U0.3 <dbl>

Organize our data

Gathering

  1. Identify the variables which must be gathered resulting in a tidy data frame. Tip: select columns between with operator :
gather( <put variables here> , key="sample", value="expression")

Gathering all together

cleaned_data <- original_data %>%
                separate(NAME, c("name", "BP", "MF", "systematic_name", "number"), sep="\\|\\|") %>%
                select(-number, -GID, -YORF, -GWEIGHT) %>%
                gather(G0.05:U0.3, key="sample", value="expression")
cleaned_data
## # A tibble: 199,332 x 6
##    name    BP           MF               systematic_name sample expression
##    <chr>   <chr>        <chr>            <chr>           <chr>       <dbl>
##  1 "SFB2 … " ER to Gol… " molecular fun… " YNL049C "     G0.05       -0.24
##  2 "     … " biologica… " molecular fun… " YNL095C "     G0.05        0.28
##  3 "QRI7 … " proteolys… " metalloendope… " YDL104C "     G0.05       -0.02
##  4 "CFT2 … " mRNA poly… " RNA binding "  " YLR115W "     G0.05       -0.33
##  5 "SSO2 … " vesicle f… " t-SNARE activ… " YMR183C "     G0.05        0.05
##  6 "PSP2 … " biologica… " molecular fun… " YML017W "     G0.05       -0.69
##  7 "RIB2 … " riboflavi… " pseudouridyla… " YOL066C "     G0.05       -0.55
##  8 "VMA13… " vacuolar … " hydrogen-tran… " YPR036W "     G0.05       -0.75
##  9 "EDC3 … " deadenyly… " molecular fun… " YEL015W "     G0.05       -0.24
## 10 "VPS5 … " protein r… " protein trans… " YOR069W "     G0.05       -0.16
## # ... with 199,322 more rows
name BP MF systematic_name sample expression
SFB2 ER to Golgi transport molecular function unknown YNL049C G0.05 -0.24
biological process unknown molecular function unknown YNL095C G0.05 0.28
QRI7 proteolysis and peptidolysis metalloendopeptidase activity YDL104C G0.05 -0.02
CFT2 mRNA polyadenylylation* RNA binding YLR115W G0.05 -0.33
SSO2 vesicle fusion* t-SNARE activity YMR183C G0.05 0.05
PSP2 biological process unknown molecular function unknown YML017W G0.05 -0.69

Rename a column

rename(cleaned_data, gene_name = name)
## # A tibble: 199,332 x 6
##    gene_name  BP          MF             systematic_name sample expression
##    <chr>      <chr>       <chr>          <chr>           <chr>       <dbl>
##  1 "SFB2    … " ER to Go… " molecular f… " YNL049C "     G0.05       -0.24
##  2 "        … " biologic… " molecular f… " YNL095C "     G0.05        0.28
##  3 "QRI7    … " proteoly… " metalloendo… " YDL104C "     G0.05       -0.02
##  4 "CFT2    … " mRNA pol… " RNA binding… " YLR115W "     G0.05       -0.33
##  5 "SSO2    … " vesicle … " t-SNARE act… " YMR183C "     G0.05        0.05
##  6 "PSP2    … " biologic… " molecular f… " YML017W "     G0.05       -0.69
##  7 "RIB2    … " riboflav… " pseudouridy… " YOL066C "     G0.05       -0.55
##  8 "VMA13   … " vacuolar… " hydrogen-tr… " YPR036W "     G0.05       -0.75
##  9 "EDC3    … " deadenyl… " molecular f… " YEL015W "     G0.05       -0.24
## 10 "VPS5    … " protein … " protein tra… " YOR069W "     G0.05       -0.16
## # ... with 199,322 more rows

Filter rows with filter()

      filter(<name of the data frame>, <argument(s) that filter the data frame>)
filter(cleaned_data, BP == " mRNA polyadenylylation* ")
## Warning: package 'bindrcpp' was built under R version 3.4.4
## # A tibble: 540 x 6
##    name      BP              MF          systematic_name sample expression
##    <chr>     <chr>           <chr>       <chr>           <chr>       <dbl>
##  1 "CFT2   … " mRNA polyade… " RNA bind… " YLR115W "     G0.05       -0.33
##  2 "HRP1   … " mRNA polyade… " RNA bind… " YOL123W "     G0.05       -0.35
##  3 "MPE1   … " mRNA polyade… " RNA bind… " YKL059C "     G0.05       -0.23
##  4 "PTA1   … " mRNA polyade… " RNA bind… " YAL043C "     G0.05       -0.44
##  5 "YSH1   … " mRNA polyade… " RNA bind… " YLR277C "     G0.05       -0.45
##  6 "CLP1   … " mRNA polyade… " RNA bind… " YOR250C "     G0.05       -0.01
##  7 "FIP1   … " mRNA polyade… " protein … " YJR093C "     G0.05       -0.12
##  8 "RNA15  … " mRNA polyade… " RNA bind… " YGL044C "     G0.05       -0.1 
##  9 "RNA14  … " mRNA polyade… " RNA bind… " YMR061W "     G0.05       -0.13
## 10 "YTH1   … " mRNA polyade… " RNA bind… " YPR107C "     G0.05        0.39
## # ... with 530 more rows

Removing Whitespaces

head(cleaned_data$BP)
## [1] " ER to Golgi transport "        " biological process unknown "  
## [3] " proteolysis and peptidolysis " " mRNA polyadenylylation* "     
## [5] " vesicle fusion* "              " biological process unknown "
cleaned_data <- cleaned_data %>% mutate_each(funs(trimws), name:systematic_name)
## `mutate_each()` is deprecated.
## Use `mutate_all()`, `mutate_at()` or `mutate_if()` instead.
## To map `funs` over a selection of variables, use `mutate_at()`
## [1] "ER to Golgi transport"        "biological process unknown"  
## [3] "proteolysis and peptidolysis" "mRNA polyadenylylation*"     
## [5] "vesicle fusion*"              "biological process unknown"

Back to Filter

filter(cleaned_data, BP=="leucine biosynthesis", expression > 1.0)
## # A tibble: 14 x 6
##    name  BP          MF                  systematic_name sample expression
##    <chr> <chr>       <chr>               <chr>           <chr>       <dbl>
##  1 LEU4  leucine bi… 2-isopropylmalate … YNL104C         G0.05        2.61
##  2 LEU4  leucine bi… 2-isopropylmalate … YNL104C         G0.1         1.53
##  3 LEU4  leucine bi… 2-isopropylmalate … YNL104C         G0.15        1.38
##  4 LEU4  leucine bi… 2-isopropylmalate … YNL104C         G0.2         1.06
##  5 LEU1  leucine bi… 3-isopropylmalate … YGL009C         L0.05        3.84
##  6 LEU2  leucine bi… 3-isopropylmalate … YCL018W         L0.05        1.54
##  7 LEU4  leucine bi… 2-isopropylmalate … YNL104C         L0.05        1.94
##  8 LEU1  leucine bi… 3-isopropylmalate … YGL009C         L0.1         3.36
##  9 LEU2  leucine bi… 3-isopropylmalate … YCL018W         L0.1         1.23
## 10 LEU4  leucine bi… 2-isopropylmalate … YNL104C         L0.1         1.71
## 11 LEU1  leucine bi… 3-isopropylmalate … YGL009C         L0.15        3.24
## 12 LEU4  leucine bi… 2-isopropylmalate … YNL104C         L0.15        1.06
## 13 LEU1  leucine bi… 3-isopropylmalate … YGL009C         L0.2         2.84
## 14 LEU1  leucine bi… 3-isopropylmalate … YGL009C         L0.25        2.04

One more step to clean our data

cleaned_data <- cleaned_data %>% separate(sample, c("nutrient", "rate"), sep=1, convert=TRUE)

head(cleaned_data)
## # A tibble: 6 x 7
##   name  BP           MF          systematic_name nutrient  rate expression
##   <chr> <chr>        <chr>       <chr>           <chr>    <dbl>      <dbl>
## 1 SFB2  ER to Golgi… molecular … YNL049C         G         0.05      -0.24
## 2 ""    biological … molecular … YNL095C         G         0.05       0.28
## 3 QRI7  proteolysis… metalloend… YDL104C         G         0.05      -0.02
## 4 CFT2  mRNA polyad… RNA binding YLR115W         G         0.05      -0.33
## 5 SSO2  vesicle fus… t-SNARE ac… YMR183C         G         0.05       0.05
## 6 PSP2  biological … molecular … YML017W         G         0.05      -0.69

Use Logical Operators

filter(cleaned_data, BP == "mRNA polyadenylylation*" & MF == "RNA binding")
filter(cleaned_data, BP == "mRNA polyadenylylation*" | BP == "RNA binding")

Logical Operators

Missing Values

Adding columns with mutate()

      mutate(<name of the data frame or nothing if using pipe>,        <name of new column>=<operation on existing columns>,        <name of another new column>=<another operation on existing columns>)
cleaned_data %>% mutate(expression_brute = 2**expression) %>% select(expression , expression_brute)
      y <- c(1,2,2,NA,4,3)
      min_rank(y)
## [1]  1  2  2 NA  5  4

Sorting rows with arrange()

  1. In ascending order by one or more columns (variables):
arrange(cleaned_data, expression)
  1. Use desc() to re-order in descending order:
arrange(cleaned_data, desc(expression))

Exercise

  1. Find the 10 most expressed genes when restricted by nutrient leucine (L).

Exercise

  1. Find the 10 most expressed genes when restricted by nutrient leucine (L).
top10 <- cleaned_data %>% filter(nutrient == "L") %>%
                              arrange(desc(expression))

top10_leu <- top10[1:10,]
top10_leu
## # A tibble: 10 x 7
##    name  BP         MF           systematic_name nutrient  rate expression
##    <chr> <chr>      <chr>        <chr>           <chr>    <dbl>      <dbl>
##  1 HXT3  hexose tr… glucose tra… YDR345C         L         0.3        5.36
##  2 HXT3  hexose tr… glucose tra… YDR345C         L         0.2        5.22
##  3 HXT3  hexose tr… glucose tra… YDR345C         L         0.05       5.16
##  4 HXT3  hexose tr… glucose tra… YDR345C         L         0.15       5.02
##  5 HXT3  hexose tr… glucose tra… YDR345C         L         0.25       4.99
##  6 HXT5  hexose tr… glucose tra… YHR096C         L         0.05       4.9 
##  7 HSP26 response … unfolded pr… YBR072W         L         0.05       4.86
##  8 QDR2  multidrug… multidrug e… YIL121W         L         0.1        4.78
##  9 BAP3  amino aci… amino acid … YDR046C         L         0.25       4.65
## 10 BAP3  amino aci… amino acid … YDR046C         L         0.3        4.62

Summarizing data with summarise()

      summarise(<name of the data frame or nothing if using pipe>,        <name of new column>=<summarised value>)
summarise(cleaned_data, mean_expression = mean(expression, na.rm = TRUE))
## # A tibble: 1 x 1
##   mean_expression
##             <dbl>
## 1         0.00328
summarise(cleaned_data, mean_expression = mean(expression, na.rm = TRUE), count = n())
## # A tibble: 1 x 2
##   mean_expression  count
##             <dbl>  <int>
## 1         0.00328 199332
# The exactly above code applied to a data frame grouped by nutrient and rate, 
# we get the average delay per date
cleaned_data %>% 
  group_by(nutrient, rate) %>%
    summarise(mean_expression_by_rate = mean(expression, na.rm = TRUE), count = n())
## # A tibble: 36 x 4
## # Groups:   nutrient [?]
##    nutrient  rate mean_expression_by_rate count
##    <chr>    <dbl>                   <dbl> <int>
##  1 G         0.05                 0.0209   5537
##  2 G         0.1                  0.0171   5537
##  3 G         0.15                 0.0388   5537
##  4 G         0.2                  0.0479   5537
##  5 G         0.25                 0.0145   5537
##  6 G         0.3                 -0.00551  5537
##  7 L         0.05                 0.0257   5537
##  8 L         0.1                  0.0305   5537
##  9 L         0.15                 0.0104   5537
## 10 L         0.2                  0.00216  5537
## # ... with 26 more rows

Summary functions

cleaned_data %>%
  group_by(nutrient, rate) %>% 
    summarise(
      least_expr = min(expression),
      most_expr = max(expression)
    )

Summary functions

cleaned_data %>%
  filter(!is.na(expression)) %>%
  group_by(nutrient, rate) %>% 
    summarise(
      least_expr = min(expression),
      most_expr = max(expression)
    )
cleaned_data %>%
        filter(BP == "leucine biosynthesis") %>%
        ggplot(aes(rate, expression, color = nutrient)) +
        geom_point() +
        geom_line() +
        facet_wrap(~name)

Take home message

Bibliography

http://varianceexplained.org/r/tidy-genomics/

http://r4ds.had.co.nz/transform.html