R数据整形术之 tidyr

Happy families are all alike; every unhappy family is unhappy in its own way — Leo Tolstoy

R数据整形包之一tidyr最近迎来更新(tidyr 0.4.0)，所以有必要对其Tidy data进行学习。
以下为个人简单总结：

#gather--mutate--separate--select--arrange
# Make the tutorial folder the working directory so that the relative
# "preg.csv" path used by the read.csv() calls below resolves.
# NOTE(review): this absolute Windows path is machine-specific; adjust it (or
# run the script from the data folder) before executing on another machine.
setwd("F:/Rwork/tidyr")
library(tidyr)

## Warning: package 'tidyr' was built under R version 3.2.3

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.2.2

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load the wide-format treatment table as a plain data.frame, keeping the
# name column as character rather than factor.
preg <- read.csv(file = "preg.csv", stringsAsFactors = FALSE)
preg

##           name treatmenta treatmentb
## 1   John Smith         NA         18
## 2     Jane Doe          4          1
## 3 Mary Johnson          6          7

# Same file again, this time wrapped with tbl_df() so that printing also
# shows the table dimensions and each column's type.
preg6 <- read.csv("preg.csv", stringsAsFactors = FALSE) %>%
  tbl_df()
preg6

## Source: local data frame [3 x 3]
## 
##           name treatmenta treatmentb
##          (chr)      (int)      (int)
## 1   John Smith         NA         18
## 2     Jane Doe          4          1
## 3 Mary Johnson          6          7

# Reshape preg from wide to long and tidy it up in a single pipeline.
#   gather():  the first argument names the new key column (it receives the old
#              column headings), the second names the new value column, and the
#              remaining arguments pick the columns to gather -- here every
#              column except name.
#              gather(treatment, n, -name, na.rm = TRUE) selects the same
#              columns but additionally drops rows whose value is missing.
#   mutate():  strip the "treatment" prefix so only the a/b suffix is left.
#   arrange(): sort the rows by name, then by treatment.
preg2 <- preg %>%
  gather(key = treatment, value = n, treatmenta:treatmentb) %>%
  mutate(
    treatment = gsub(pattern = "treatment", replacement = "", x = treatment)
  ) %>%
  arrange(name, treatment)
preg2

##           name treatment  n
## 1     Jane Doe         a  4
## 2     Jane Doe         b  1
## 3   John Smith         a NA
## 4   John Smith         b 18
## 5 Mary Johnson         a  6
## 6 Mary Johnson         b  7

# Display the result of each step one by one
# Step 1 - gather(): stack treatmenta and treatmentb into a key column
# (treatment) and a value column (n).
preg3 <- gather(preg, key = treatment, value = n, treatmenta:treatmentb)
preg3

##           name  treatment  n
## 1   John Smith treatmenta NA
## 2     Jane Doe treatmenta  4
## 3 Mary Johnson treatmenta  6
## 4   John Smith treatmentb 18
## 5     Jane Doe treatmentb  1
## 6 Mary Johnson treatmentb  7

# separate(): split the treatment column after its 9th character, giving a
# Treatments column ("treatment") and a group column ("a" / "b").
preg33 <- separate(
  preg3,
  col = treatment,
  into = c("Treatments", "group"),
  sep = 9
)
preg33

##           name Treatments group  n
## 1   John Smith  treatment     a NA
## 2     Jane Doe  treatment     a  4
## 3 Mary Johnson  treatment     a  6
## 4   John Smith  treatment     b 18
## 5     Jane Doe  treatment     b  1
## 6 Mary Johnson  treatment     b  7

# select(): keep only the name, group and n columns (drops Treatments).
preg333 <- select(preg33, name, group, n)
preg333

##           name group  n
## 1   John Smith     a NA
## 2     Jane Doe     a  4
## 3 Mary Johnson     a  6
## 4   John Smith     b 18
## 5     Jane Doe     b  1
## 6 Mary Johnson     b  7

# spread(): the inverse of gather() -- every distinct value of group becomes
# its own column, filled with the matching n.
preg3333 <- spread(preg333, key = group, value = n)
preg3333

##           name  a  b
## 1     Jane Doe  4  1
## 2   John Smith NA 18
## 3 Mary Johnson  6  7

# Step 2 - mutate(): remove the "treatment" prefix from the key column so it
# holds just "a" or "b".
preg4 <- mutate(
  preg3,
  treatment = gsub(pattern = "treatment", replacement = "", x = treatment)
)
preg4

##           name treatment  n
## 1   John Smith         a NA
## 2     Jane Doe         a  4
## 3 Mary Johnson         a  6
## 4   John Smith         b 18
## 5     Jane Doe         b  1
## 6 Mary Johnson         b  7

# Step 3 - arrange(): order the rows by name and, within a name, by treatment.
preg5 <- arrange(preg4, name, treatment)
preg5

##           name treatment  n
## 1     Jane Doe         a  4
## 2     Jane Doe         b  1
## 3   John Smith         a NA
## 4   John Smith         b 18
## 5 Mary Johnson         a  6
## 6 Mary Johnson         b  7

# Read every CSV file located in the same directory into a single data frame.
# plyr provides ldply(), used below to stack the CSV files.
# NOTE(review): attaching plyr after dplyr masks dplyr's arrange(), count(),
# desc(), mutate(), rename(), summarise(), etc. Any dplyr verb used after this
# line should be called as dplyr::<verb>(), or plyr should be loaded before
# dplyr.
library(plyr)

## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

# Collect the full path of every .csv file in the tutorial folder and label
# each path with its bare file name; ldply() reads every file and row-binds
# the results, recording those labels in the .id column.
paths <- list.files(
  path = "F:/Rwork/tidyr",
  pattern = "\\.csv$",
  full.names = TRUE
)
paths <- setNames(paths, basename(paths))
all <- ldply(.data = paths, .fun = read.csv, stringsAsFactors = FALSE)
all

##               .id         name treatmenta treatmentb
## 1 preg - Copy.csv   John Smith         NA         18
## 2 preg - Copy.csv     Jane Doe          4          1
## 3 preg - Copy.csv Mary Johnson          6          7
## 4        preg.csv   John Smith         NA         18
## 5        preg.csv     Jane Doe          4          1
## 6        preg.csv Mary Johnson          6          7

# Keep only the rows of preg3 whose name column is "John Smith" or "Jane Doe".
# Row filter via a logical mask: %in% never yields NA, so this keeps exactly
# the matching rows (with their original row names) and all columns.
preg3[preg3$name %in% c("John Smith", "Jane Doe"), ]

##         name  treatment  n
## 1 John Smith treatmenta NA
## 2   Jane Doe treatmenta  4
## 4 John Smith treatmentb 18
## 5   Jane Doe treatmentb  1