Rodrigo Theodoro Rocha
11 de Outubro 2018
Organize your dataPick observations and variablesAggregate variablesnew variables or summariesRename the variables or reorder the observationsdplyr makes it easier !| Day | Sample 1 | Sample 2 | Sample 3 |
|---|---|---|---|
| 1 | 100 | 120 | 89 |
| 2 | 50 | 30 | 55 |
| 3 | 25 | 10 |
day <- c(1, 5, 10)
sample1 <- c(100, 50, 25)
sample2 <- c(120, 30, 10)
sample3 <- c(89, 55, NA)
dfmessyWrongName <- data.frame("Day"=day, "Sample 1"=sample1, "Sample 2"=sample2, "Sample 3"=sample3)| Day | Sample.1 | Sample.2 | Sample.3 |
|---|---|---|---|
| 1 | 100 | 120 | 89 |
| 5 | 50 | 30 | 55 |
| 10 | 25 | 10 | NA |
| Day | Sample 1 | Sample 2 | Sample 3 |
|---|---|---|---|
| 1 | 100 | 120 | 89 |
| 2 | 50 | 30 | 55 |
| 3 | 25 | 10 |
dfmessy <- data.frame("Day"=day, "Sample_1"=sample1, "Sample_2"=sample2, "Sample_3"=sample3)| Day | Sample_1 | Sample_2 | Sample_3 |
|---|---|---|---|
| 1 | 100 | 120 | 89 |
| 5 | 50 | 30 | 55 |
| 10 | 25 | 10 | NA |
Could you make a plot with Day in x-axis and all Samples measurements in y-axis?
plot(dfmessy$Day, dfmessy$Sample_1, xlab='Day', ylab='Measurement (mg)', col='red',pch=19)
points(dfmessy$Day, dfmessy$Sample_2, col='black',pch=19)
points(dfmessy$Day, dfmessy$Sample_3, col='green',pch=19)tidyr package.What are our variables ?
What are our observations ?
| Day | Sample_1 | Sample_2 | Sample_3 |
|---|---|---|---|
| 1 | 100 | 120 | 89 |
| 5 | 50 | 30 | 55 |
| 10 | 25 | 10 | NA |
## Warning: package 'nycflights13' was built under R version 3.4.4
library(tidyr)gather(dfmessy, Sample_1, Sample_2, Sample_3, key = 'Samples', value = 'Measurements')## Day Samples Measurements
## 1 1 Sample_1 100
## 2 5 Sample_1 50
## 3 10 Sample_1 25
## 4 1 Sample_2 120
## 5 5 Sample_2 30
## 6 10 Sample_2 10
## 7 1 Sample_3 89
## 8 5 Sample_3 55
## 9 10 Sample_3 NA
plot(dftidy$Day, dftidy$measurement, xlab='Day', ylab='Measurement (mg)',col=as.factor(dftidy$samples), pch=19)library(readr)
original_data <- read_delim("http://varianceexplained.org/files/Brauer2008_DataSet1.tds", delim = "\t")## Parsed with column specification:
## cols(
## .default = col_double(),
## GID = col_character(),
## YORF = col_character(),
## NAME = col_character(),
## GWEIGHT = col_integer()
## )
## See spec(...) for full column specifications.
View(head(original_data))original_data$NAME[1:3]## [1] "SFB2 || ER to Golgi transport || molecular function unknown || YNL049C || 1082129"
## [2] " || biological process unknown || molecular function unknown || YNL095C || 1086222"
## [3] "QRI7 || proteolysis and peptidolysis || metalloendopeptidase activity || YDL104C || 1085955"
NAME column contains lots of information
cleaned_data <- separate(original_data, NAME, c("name", "BP", "MF", "systematic_name", "number"), sep="\\|\\|")DATAFRAME %>% separate( <name of the variable (without quotes) to separate>, < vector with new variable names>, sep= separator)
cleaned_data <- original_data %>% separate(NAME, c("name", "BP", "MF", "systematic_name", "number"), sep="\\|\\|")dplyr functions never modify their inputs%>%Free us from creating intermediate objects
Useful to concatenate multiple operations
select()Allows you to select specific variables (columns)
Syntax:
# Select columns by name: name, BP, MF, systematic_name
select(cleaned_data, name, BP, MF, systematic_name)
# Select all columns between name and systematic_name (inclusive)
select(cleaned_data, name:systematic_name)
# Select all columns except number and GWEIGHT
select(cleaned_data, -number, -GWEIGHT)cleaned_data with all variables except number, GID, YORF and GWEIGHT. Save it in a new object called cleaned_data_new.(cleaned_data_new <- cleaned_data %>% select(-number, -GID, -YORF, -GWEIGHT))## # A tibble: 5,537 x 40
## name BP MF systematic_name G0.05 G0.1 G0.15 G0.2 G0.25 G0.3
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "SFB… " ER… " mo… " YNL049C " -0.24 -0.13 -0.21 -0.15 -0.05 -0.05
## 2 " … " bi… " mo… " YNL095C " 0.28 0.13 -0.4 -0.48 -0.11 0.17
## 3 "QRI… " pr… " me… " YDL104C " -0.02 -0.27 -0.27 -0.02 0.24 0.25
## 4 "CFT… " mR… " RN… " YLR115W " -0.33 -0.41 -0.24 -0.03 -0.03 0
## 5 "SSO… " ve… " t-… " YMR183C " 0.05 0.02 0.4 0.34 -0.13 -0.14
## 6 "PSP… " bi… " mo… " YML017W " -0.69 -0.03 0.23 0.2 0 -0.27
## 7 "RIB… " ri… " ps… " YOL066C " -0.55 -0.3 -0.12 -0.03 -0.16 -0.11
## 8 "VMA… " va… " hy… " YPR036W " -0.75 -0.12 -0.07 0.02 -0.32 -0.41
## 9 "EDC… " de… " mo… " YEL015W " -0.24 -0.22 0.14 0.06 0 -0.13
## 10 "VPS… " pr… " pr… " YOR069W " -0.16 -0.38 0.05 0.14 -0.04 -0.01
## # ... with 5,527 more rows, and 30 more variables: N0.05 <dbl>,
## # N0.1 <dbl>, N0.15 <dbl>, N0.2 <dbl>, N0.25 <dbl>, N0.3 <dbl>,
## # P0.05 <dbl>, P0.1 <dbl>, P0.15 <dbl>, P0.2 <dbl>, P0.25 <dbl>,
## # P0.3 <dbl>, S0.05 <dbl>, S0.1 <dbl>, S0.15 <dbl>, S0.2 <dbl>,
## # S0.25 <dbl>, S0.3 <dbl>, L0.05 <dbl>, L0.1 <dbl>, L0.15 <dbl>,
## # L0.2 <dbl>, L0.25 <dbl>, L0.3 <dbl>, U0.05 <dbl>, U0.1 <dbl>,
## # U0.15 <dbl>, U0.2 <dbl>, U0.25 <dbl>, U0.3 <dbl>
:gather( <put variables here> , key="sample", value="expression")cleaned_data <- original_data %>%
separate(NAME, c("name", "BP", "MF", "systematic_name", "number"), sep="\\|\\|") %>%
select(-number, -GID, -YORF, -GWEIGHT) %>%
gather(G0.05:U0.3, key="sample", value="expression")
cleaned_data## # A tibble: 199,332 x 6
## name BP MF systematic_name sample expression
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 "SFB2 … " ER to Gol… " molecular fun… " YNL049C " G0.05 -0.24
## 2 " … " biologica… " molecular fun… " YNL095C " G0.05 0.28
## 3 "QRI7 … " proteolys… " metalloendope… " YDL104C " G0.05 -0.02
## 4 "CFT2 … " mRNA poly… " RNA binding " " YLR115W " G0.05 -0.33
## 5 "SSO2 … " vesicle f… " t-SNARE activ… " YMR183C " G0.05 0.05
## 6 "PSP2 … " biologica… " molecular fun… " YML017W " G0.05 -0.69
## 7 "RIB2 … " riboflavi… " pseudouridyla… " YOL066C " G0.05 -0.55
## 8 "VMA13… " vacuolar … " hydrogen-tran… " YPR036W " G0.05 -0.75
## 9 "EDC3 … " deadenyly… " molecular fun… " YEL015W " G0.05 -0.24
## 10 "VPS5 … " protein r… " protein trans… " YOR069W " G0.05 -0.16
## # ... with 199,322 more rows
| name | BP | MF | systematic_name | sample | expression |
|---|---|---|---|---|---|
| SFB2 | ER to Golgi transport | molecular function unknown | YNL049C | G0.05 | -0.24 |
| biological process unknown | molecular function unknown | YNL095C | G0.05 | 0.28 | |
| QRI7 | proteolysis and peptidolysis | metalloendopeptidase activity | YDL104C | G0.05 | -0.02 |
| CFT2 | mRNA polyadenylylation* | RNA binding | YLR115W | G0.05 | -0.33 |
| SSO2 | vesicle fusion* | t-SNARE activity | YMR183C | G0.05 | 0.05 |
| PSP2 | biological process unknown | molecular function unknown | YML017W | G0.05 | -0.69 |
rename(cleaned_data, gene_name = name)## # A tibble: 199,332 x 6
## gene_name BP MF systematic_name sample expression
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 "SFB2 … " ER to Go… " molecular f… " YNL049C " G0.05 -0.24
## 2 " … " biologic… " molecular f… " YNL095C " G0.05 0.28
## 3 "QRI7 … " proteoly… " metalloendo… " YDL104C " G0.05 -0.02
## 4 "CFT2 … " mRNA pol… " RNA binding… " YLR115W " G0.05 -0.33
## 5 "SSO2 … " vesicle … " t-SNARE act… " YMR183C " G0.05 0.05
## 6 "PSP2 … " biologic… " molecular f… " YML017W " G0.05 -0.69
## 7 "RIB2 … " riboflav… " pseudouridy… " YOL066C " G0.05 -0.55
## 8 "VMA13 … " vacuolar… " hydrogen-tr… " YPR036W " G0.05 -0.75
## 9 "EDC3 … " deadenyl… " molecular f… " YEL015W " G0.05 -0.24
## 10 "VPS5 … " protein … " protein tra… " YOR069W " G0.05 -0.16
## # ... with 199,322 more rows
filter()filter(cleaned_data, BP == " mRNA polyadenylylation* ")## Warning: package 'bindrcpp' was built under R version 3.4.4
## # A tibble: 540 x 6
## name BP MF systematic_name sample expression
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 "CFT2 … " mRNA polyade… " RNA bind… " YLR115W " G0.05 -0.33
## 2 "HRP1 … " mRNA polyade… " RNA bind… " YOL123W " G0.05 -0.35
## 3 "MPE1 … " mRNA polyade… " RNA bind… " YKL059C " G0.05 -0.23
## 4 "PTA1 … " mRNA polyade… " RNA bind… " YAL043C " G0.05 -0.44
## 5 "YSH1 … " mRNA polyade… " RNA bind… " YLR277C " G0.05 -0.45
## 6 "CLP1 … " mRNA polyade… " RNA bind… " YOR250C " G0.05 -0.01
## 7 "FIP1 … " mRNA polyade… " protein … " YJR093C " G0.05 -0.12
## 8 "RNA15 … " mRNA polyade… " RNA bind… " YGL044C " G0.05 -0.1
## 9 "RNA14 … " mRNA polyade… " RNA bind… " YMR061W " G0.05 -0.13
## 10 "YTH1 … " mRNA polyade… " RNA bind… " YPR107C " G0.05 0.39
## # ... with 530 more rows
head(cleaned_data$BP)## [1] " ER to Golgi transport " " biological process unknown "
## [3] " proteolysis and peptidolysis " " mRNA polyadenylylation* "
## [5] " vesicle fusion* " " biological process unknown "
cleaned_data <- cleaned_data %>% mutate_each(funs(trimws), name:systematic_name)## `mutate_each()` is deprecated.
## Use `mutate_all()`, `mutate_at()` or `mutate_if()` instead.
## To map `funs` over a selection of variables, use `mutate_at()`
## [1] "ER to Golgi transport" "biological process unknown"
## [3] "proteolysis and peptidolysis" "mRNA polyadenylylation*"
## [5] "vesicle fusion*" "biological process unknown"
filter(cleaned_data, BP=="leucine biosynthesis", expression > 1.0)## # A tibble: 14 x 6
## name BP MF systematic_name sample expression
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 LEU4 leucine bi… 2-isopropylmalate … YNL104C G0.05 2.61
## 2 LEU4 leucine bi… 2-isopropylmalate … YNL104C G0.1 1.53
## 3 LEU4 leucine bi… 2-isopropylmalate … YNL104C G0.15 1.38
## 4 LEU4 leucine bi… 2-isopropylmalate … YNL104C G0.2 1.06
## 5 LEU1 leucine bi… 3-isopropylmalate … YGL009C L0.05 3.84
## 6 LEU2 leucine bi… 3-isopropylmalate … YCL018W L0.05 1.54
## 7 LEU4 leucine bi… 2-isopropylmalate … YNL104C L0.05 1.94
## 8 LEU1 leucine bi… 3-isopropylmalate … YGL009C L0.1 3.36
## 9 LEU2 leucine bi… 3-isopropylmalate … YCL018W L0.1 1.23
## 10 LEU4 leucine bi… 2-isopropylmalate … YNL104C L0.1 1.71
## 11 LEU1 leucine bi… 3-isopropylmalate … YGL009C L0.15 3.24
## 12 LEU4 leucine bi… 2-isopropylmalate … YNL104C L0.15 1.06
## 13 LEU1 leucine bi… 3-isopropylmalate … YGL009C L0.2 2.84
## 14 LEU1 leucine bi… 3-isopropylmalate … YGL009C L0.25 2.04
cleaned_data <- cleaned_data %>% separate(sample, c("nutrient", "rate"), sep=1, convert=TRUE)
head(cleaned_data)## # A tibble: 6 x 7
## name BP MF systematic_name nutrient rate expression
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 SFB2 ER to Golgi… molecular … YNL049C G 0.05 -0.24
## 2 "" biological … molecular … YNL095C G 0.05 0.28
## 3 QRI7 proteolysis… metalloend… YDL104C G 0.05 -0.02
## 4 CFT2 mRNA polyad… RNA binding YLR115W G 0.05 -0.33
## 5 SSO2 vesicle fus… t-SNARE ac… YMR183C G 0.05 0.05
## 6 PSP2 biological … molecular … YML017W G 0.05 -0.69
mRNA polyadenylylation* and have RNA binding activity?filter(cleaned_data, BP == "mRNA polyadenylylation*" & MF == "RNA binding")filter(cleaned_data, BP == "mRNA polyadenylylation*" | BP == "RNA binding")NASuppose we want to calculate the mean expression of all genes
> mean(cleaned_data$expression)
NAs
is.na(expression) )mutate()cleaned_data %>% mutate(expression_brute = 2**expression) %>% select(expression , expression_brute)+, -, /, *log2(), log10() y <- c(1,2,2,NA,4,3)
min_rank(y)## [1] 1 2 2 NA 5 4
arrange()arrange(cleaned_data, expression)desc() to re-order in descending order:arrange(cleaned_data, desc(expression))top10 <- cleaned_data %>% filter(nutrient == "L") %>%
arrange(desc(expression))
top10_leu <- top10[1:10,]
top10_leu## # A tibble: 10 x 7
## name BP MF systematic_name nutrient rate expression
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 HXT3 hexose tr… glucose tra… YDR345C L 0.3 5.36
## 2 HXT3 hexose tr… glucose tra… YDR345C L 0.2 5.22
## 3 HXT3 hexose tr… glucose tra… YDR345C L 0.05 5.16
## 4 HXT3 hexose tr… glucose tra… YDR345C L 0.15 5.02
## 5 HXT3 hexose tr… glucose tra… YDR345C L 0.25 4.99
## 6 HXT5 hexose tr… glucose tra… YHR096C L 0.05 4.9
## 7 HSP26 response … unfolded pr… YBR072W L 0.05 4.86
## 8 QDR2 multidrug… multidrug e… YIL121W L 0.1 4.78
## 9 BAP3 amino aci… amino acid … YDR046C L 0.25 4.65
## 10 BAP3 amino aci… amino acid … YDR046C L 0.3 4.62
summarise()summarise(cleaned_data, mean_expression = mean(expression, na.rm = TRUE))## # A tibble: 1 x 1
## mean_expression
## <dbl>
## 1 0.00328
n() to count the number of observationssummarise(cleaned_data, mean_expression = mean(expression, na.rm = TRUE), count = n())## # A tibble: 1 x 2
## mean_expression count
## <dbl> <int>
## 1 0.00328 199332
group_by()
# The exactly above code applied to a data frame grouped by nutrient and rate,
# we get the average delay per date
cleaned_data %>%
group_by(nutrient, rate) %>%
summarise(mean_expression_by_rate = mean(expression, na.rm = TRUE), count = n())## # A tibble: 36 x 4
## # Groups: nutrient [?]
## nutrient rate mean_expression_by_rate count
## <chr> <dbl> <dbl> <int>
## 1 G 0.05 0.0209 5537
## 2 G 0.1 0.0171 5537
## 3 G 0.15 0.0388 5537
## 4 G 0.2 0.0479 5537
## 5 G 0.25 0.0145 5537
## 6 G 0.3 -0.00551 5537
## 7 L 0.05 0.0257 5537
## 8 L 0.1 0.0305 5537
## 9 L 0.15 0.0104 5537
## 10 L 0.2 0.00216 5537
## # ... with 26 more rows
mean(), median()sd()min(), max()
cleaned_data %>%
group_by(nutrient, rate) %>%
summarise(
least_expr = min(expression),
most_expr = max(expression)
)mean(), median()sd()min(), max()
cleaned_data %>%
filter(!is.na(expression)) %>%
group_by(nutrient, rate) %>%
summarise(
least_expr = min(expression),
most_expr = max(expression)
)cleaned_data %>%
filter(BP == "leucine biosynthesis") %>%
ggplot(aes(rate, expression, color = nutrient)) +
geom_point() +
geom_line() +
facet_wrap(~name)dplyr and tidyr functions to manipulate your data:gather()separate()filter()select()mutate()arrange()summarise()