R数据整形术之 tidyr

Happy families are all alike; every unhappy family is unhappy in its own way — Leo Tolstoy



R数据整形包之一tidyr最近迎来更新(tidyr 0.4.0),所以有必要对其Tidy data进行学习。
以下为个人简单总结:

# Workflow demonstrated below: gather -> mutate -> separate -> select -> arrange.
# NOTE(review): the working directory is hard-coded to a local Windows path;
# the relative read.csv("preg.csv") calls below resolve against it, so adjust
# this path before running the script on another machine.
setwd("F:/Rwork/tidyr")
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.2
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the example table as a plain data.frame, keeping text columns as
# character vectors instead of factors, then print it.
preg <- read.csv(file = "preg.csv", stringsAsFactors = FALSE)
preg
##           name treatmenta treatmentb
## 1   John Smith         NA         18
## 2     Jane Doe          4          1
## 3 Mary Johnson          6          7
# Read the same CSV again and wrap the result with tbl_df() so that it prints
# in dplyr's compact form (dimensions plus column types).
preg6 <- read.csv("preg.csv", stringsAsFactors = FALSE) %>%
  tbl_df()
preg6
## Source: local data frame [3 x 3]
## 
##           name treatmenta treatmentb
##          (chr)      (int)      (int)
## 1   John Smith         NA         18
## 2     Jane Doe          4          1
## 3 Mary Johnson          6          7
# Reshape `preg` from wide to long and tidy up the key column:
#   1. gather(): the first argument names the key column (it receives the old
#      column headings), the second names the value column, and the third
#      selects the columns to gather -- here treatmenta through treatmentb,
#      i.e. every column except `name`.
#   2. mutate(): strip the "treatment" prefix so the key holds just "a" / "b".
#   3. arrange(): sort the rows by name, then by treatment.
# The columns could equally be selected with `-name`. Adding `na.rm = TRUE`
# to gather() would additionally drop rows whose value is NA (John Smith /
# treatment a), so that variant does NOT give the same result as this one:
#   gather(treatment, n, -name, na.rm = TRUE)
preg2 <- preg %>%
  gather(key = treatment, value = n, treatmenta:treatmentb) %>%
  mutate(treatment = gsub("treatment", "", treatment)) %>%
  arrange(name, treatment)
preg2
##           name treatment  n
## 1     Jane Doe         a  4
## 2     Jane Doe         b  1
## 3   John Smith         a NA
## 4   John Smith         b 18
## 5 Mary Johnson         a  6
## 6 Mary Johnson         b  7
# Run the same steps one at a time, printing each intermediate result.
# gather(): collect the treatmenta/treatmentb columns into key/value pairs.
preg3 <- gather(preg, key = treatment, value = n, treatmenta:treatmentb)
preg3
##           name  treatment  n
## 1   John Smith treatmenta NA
## 2     Jane Doe treatmenta  4
## 3 Mary Johnson treatmenta  6
## 4   John Smith treatmentb 18
## 5     Jane Doe treatmentb  1
## 6 Mary Johnson treatmentb  7
# separate(): split `treatment` into two columns. A numeric `sep` is a
# character position, so 9 cuts right after the 9-letter word "treatment".
preg33 <- separate(
  preg3,
  col = treatment,
  into = c("Treatments", "group"),
  sep = 9
)
preg33
##           name Treatments group  n
## 1   John Smith  treatment     a NA
## 2     Jane Doe  treatment     a  4
## 3 Mary Johnson  treatment     a  6
## 4   John Smith  treatment     b 18
## 5     Jane Doe  treatment     b  1
## 6 Mary Johnson  treatment     b  7
# select(): keep only the name, group and n columns.
preg333 <- select(preg33, name, group, n)
preg333
##           name group  n
## 1   John Smith     a NA
## 2     Jane Doe     a  4
## 3 Mary Johnson     a  6
## 4   John Smith     b 18
## 5     Jane Doe     b  1
## 6 Mary Johnson     b  7
# spread(): turn each distinct value of `group` into its own column,
# filled with the matching `n`.
preg3333 <- spread(preg333, key = group, value = n)
preg3333
##           name  a  b
## 1     Jane Doe  4  1
## 2   John Smith NA 18
## 3 Mary Johnson  6  7
# mutate(): drop the "treatment" prefix from the gathered key column.
preg4 <- mutate(preg3, treatment = gsub("treatment", "", treatment))
preg4
##           name treatment  n
## 1   John Smith         a NA
## 2     Jane Doe         a  4
## 3 Mary Johnson         a  6
## 4   John Smith         b 18
## 5     Jane Doe         b  1
## 6 Mary Johnson         b  7
# arrange(): sort by name, then by treatment.
preg5 <- arrange(preg4, name, treatment)
preg5
##           name treatment  n
## 1     Jane Doe         a  4
## 2     Jane Doe         b  1
## 3   John Smith         a NA
## 4   John Smith         b 18
## 5 Mary Johnson         a  6
## 6 Mary Johnson         b  7
# Read every CSV file in one directory into a single data frame.
# plyr is deliberately not attached with library(): attaching it after dplyr
# masks dplyr's arrange(), mutate(), summarise() and others (plyr itself
# warns about this), so ldply() is called through its namespace instead.
paths <- dir("F:/Rwork/tidyr", pattern = "\\.csv$", full.names = TRUE)
# Name each path by its file name; ldply() copies these names into the
# `.id` column, recording which file every row came from.
names(paths) <- basename(paths)
all <- plyr::ldply(paths, read.csv, stringsAsFactors = FALSE)
all
##               .id         name treatmenta treatmentb
## 1 preg - Copy.csv   John Smith         NA         18
## 2 preg - Copy.csv     Jane Doe          4          1
## 3 preg - Copy.csv Mary Johnson          6          7
## 4        preg.csv   John Smith         NA         18
## 5        preg.csv     Jane Doe          4          1
## 6        preg.csv Mary Johnson          6          7
# Keep only the rows of preg3 whose name is John Smith or Jane Doe.
preg3[preg3$name %in% c("John Smith", "Jane Doe"), ]
##         name  treatment  n
## 1 John Smith treatmenta NA
## 2   Jane Doe treatmenta  4
## 4 John Smith treatmentb 18
## 5   Jane Doe treatmentb  1

本文固定链接:http://tiramisutes.github.io/2016/03/09/tidyr.html 转载请注明出处!

tiramisutes wechat

热评文章