Happy families are all alike; every unhappy family is unhappy in its own way — Leo Tolstoy
R数据整形包之一tidyr最近迎来更新(tidyr 0.4.0),所以有必要对其Tidy data进行学习。
以下为个人简单总结:
# Workflow covered below: gather -> mutate -> separate -> select -> arrange
# NOTE(review): hard-coded, machine-specific working directory; the relative
# "preg.csv" paths passed to read.csv() in this script are resolved against it.
setwd("F:/Rwork/tidyr")
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.2
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the example table from the working directory, keeping text columns as
# character vectors instead of factors, then display it.
preg <- read.csv(file = "preg.csv", stringsAsFactors = FALSE)
preg
## name treatmenta treatmentb
## 1 John Smith NA 18
## 2 Jane Doe 4 1
## 3 Mary Johnson 6 7
# Read the same file again and wrap it with tbl_df(), so that printing also
# reports the table dimensions and each column's type.
preg6 <- read.csv("preg.csv", stringsAsFactors = FALSE) %>%
  tbl_df()
preg6
## Source: local data frame [3 x 3]
##
## name treatmenta treatmentb
## (chr) (int) (int)
## 1 John Smith NA 18
## 2 Jane Doe 4 1
## 3 Mary Johnson 6 7
# Tidy the wide table in a single pipeline:
#   1. gather(): stack the columns treatmenta through treatmentb into key/value
#      pairs. `treatment` is the key column (it receives the former column
#      headings) and `n` is the value column. Selecting with `-name` would pick
#      the same two columns; adding `na.rm = TRUE` would additionally drop the
#      rows whose value is NA, so that variant is not strictly equivalent.
#   2. mutate(): remove the "treatment" prefix so only the letter remains.
#   3. arrange(): sort the rows by name, then by treatment.
preg2 <- preg %>%
  gather(key = treatment, value = n, treatmenta:treatmentb) %>%
  mutate(treatment = gsub("treatment", "", treatment)) %>%
  arrange(name, treatment)
preg2
## name treatment n
## 1 Jane Doe a 4
## 2 Jane Doe b 1
## 3 John Smith a NA
## 4 John Smith b 18
## 5 Mary Johnson a 6
## 6 Mary Johnson b 7
# The same steps again, displayed one at a time.
# Step 1: gather the two treatment columns into `treatment` (key) and `n` (value).
preg3 <- gather(preg, key = treatment, value = n, treatmenta:treatmentb)
preg3
## name treatment n
## 1 John Smith treatmenta NA
## 2 Jane Doe treatmenta 4
## 3 Mary Johnson treatmenta 6
## 4 John Smith treatmentb 18
## 5 Jane Doe treatmentb 1
## 6 Mary Johnson treatmentb 7
# separate(): split `treatment` by position. A numeric `sep` is a character
# position, so 9 cuts right after the nine letters of "treatment", leaving the
# trailing letter in `group`.
preg33 <- preg3 %>%
  separate(col = treatment, into = c("Treatments", "group"), sep = 9)
preg33
## name Treatments group n
## 1 John Smith treatment a NA
## 2 Jane Doe treatment a 4
## 3 Mary Johnson treatment a 6
## 4 John Smith treatment b 18
## 5 Jane Doe treatment b 1
## 6 Mary Johnson treatment b 7
# select(): keep only the name, group and n columns (drops `Treatments`).
preg333 <- select(preg33, name, group, n)
preg333
## name group n
## 1 John Smith a NA
## 2 Jane Doe a 4
## 3 Mary Johnson a 6
## 4 John Smith b 18
## 5 Jane Doe b 1
## 6 Mary Johnson b 7
# spread(): the inverse of gather() - each distinct value of `group` becomes
# its own column, filled with the matching `n`.
preg3333 <- preg333 %>%
  spread(key = group, value = n)
preg3333
## name a b
## 1 Jane Doe 4 1
## 2 John Smith NA 18
## 3 Mary Johnson 6 7
# Step 2: strip the "treatment" prefix from the key column of preg3.
preg4 <- mutate(
  preg3,
  treatment = gsub(pattern = "treatment", replacement = "", x = treatment)
)
preg4
## name treatment n
## 1 John Smith a NA
## 2 Jane Doe a 4
## 3 Mary Johnson a 6
## 4 John Smith b 18
## 5 Jane Doe b 1
## 6 Mary Johnson b 7
# Step 3: sort the rows by name, then by treatment.
preg5 <- arrange(preg4, name, treatment)
preg5
## name treatment n
## 1 Jane Doe a 4
## 2 Jane Doe b 1
## 3 John Smith a NA
## 4 John Smith b 18
## 5 Mary Johnson a 6
## 6 Mary Johnson b 7
# Read every CSV file found in the same directory into a single data frame.
#
# ldply() is called through its namespace rather than via library(plyr):
# attaching plyr after dplyr masks dplyr's arrange(), count(), desc(),
# mutate(), rename(), summarise(), ... for the rest of the session, which plyr
# itself warns "is likely to cause problems". Namespacing the one function that
# is needed leaves the dplyr verbs untouched.
paths <- dir("F:/Rwork/tidyr", pattern = "\\.csv$", full.names = TRUE)
# Name each path by its file name; ldply() stores these names in the `.id`
# column, so every row can be traced back to the file it came from.
names(paths) <- basename(paths)
# NOTE(review): the result is named `all`, which shadows base::all() as a
# variable; the name is kept so existing references keep working.
all <- plyr::ldply(paths, read.csv, stringsAsFactors = FALSE)
all
## .id name treatmenta treatmentb
## 1 preg - Copy.csv John Smith NA 18
## 2 preg - Copy.csv Jane Doe 4 1
## 3 preg - Copy.csv Mary Johnson 6 7
## 4 preg.csv John Smith NA 18
## 5 preg.csv Jane Doe 4 1
## 6 preg.csv Mary Johnson 6 7
# Keep only the rows of preg3 whose name is John Smith or Jane Doe; all columns
# and the original row names are retained.
preg3[preg3$name %in% c("John Smith", "Jane Doe"), , drop = FALSE]
## name treatment n
## 1 John Smith treatmenta NA
## 2 Jane Doe treatmenta 4
## 4 John Smith treatmentb 18
## 5 Jane Doe treatmentb 1