[英]R: Reshaping irregular time series data without explicit unique dates
我正在研究讀取“時間”列和“數據”列的格式不整齊的每月時間序列數據的方法。 例如,來自SEMI的此電子表格具有按月和地區組織的數據塊,但年份是分開的,並且是不連續的塊,其中YYYY格式的年份作為每個塊之前的標題。
我的目標是將此數據轉換為連續的塊,第1列為每月日期,第2:6列為區域數據。 將此電子表格導出為制表符分隔的文件后(我發現gdata
和XLConnect
都存在您可以在屏幕快照中看到的那種合並單元格的問題),我將其讀入並獲取了一個子集,這是下面dput
的來源。
我采用了首先使用以下方法剝離空行的方法:
mydf <- mydf[which(grepl("^$", mydf$January) == FALSE),]
然后在具有年份的行的“區域”列中添加標簽-通常,該標簽總是顯示在第二(“一月”)列中。
mydf[which(nchar(mydf$January) == 4) ,'Region'] <- 'mydate'
下一步是在這些“年份”行的1月至12月的列中填寫每月日期。 我認為,一旦每個月都有一個唯一的日期,我就可以使用ddply
或其他方法來處理它。
mydf[which(mydf$Region == 'mydate'), 2:13] <- apply(mydf[which(mydf$Region == 'mydate'), 2:13], 1, function(x) as.character(seq(as.Date(paste(x['January'],"-01-01", sep = "")), as.Date(paste(x['January'],"-12-01", sep = "")), by = 'month')))
這並沒有按我預期的那樣工作,因為apply
函數沒有按照我希望的方式生成日期-它們沒有順序。 我將不勝感激(a) apply
於apply
步驟的特定修補程序或(b)指針可能更簡單或更容易的替代方法。
數據和代碼如下:
mydf <- structure(list(Region = c("", "Americas", "Europe", "Japan",
"Asia Pacific", "Worldwide", "", "", "Americas", "Europe", "Japan",
"Asia Pacific", "Worldwide", "", "", "Americas", "Europe", "Japan",
"Asia Pacific", "Worldwide", "", "", "", "Americas", "Europe",
"Japan", "Asia Pacific", "Worldwide", "", "", "Americas", "Europe",
"Japan", "Asia Pacific", "Worldwide"), January = c("1980", "413136",
"189577", "34033", "39868", "676614", "", "1981", "445504", "277290",
"33970", "44642", "801406", "", "1982", "445300", "226274", "34404",
"44989", "750967", "", "January", "1983", "457604", "232443",
"34326", "46247", "770621", "", "1984", "731009", "285740", "205644",
"85426", "1307820"), February = c("", "423748", "234818", "35104",
"42398", "736069", "", "", "440225", "274526", "33795", "44005",
"792550", "", "", "438332", "226806", "33359", "44020", "742517",
"", "February", "", "457899", "233560", "32604", "46184", "770247",
"", "", "790963", "307735", "381282", "102791", "1582770"), March = c("",
"436152", "281353", "34456", "46555", "798516", "", "", "434628",
"267259", "33709", "45206", "780802", "", "", "441313", "235612",
"32380", "43600", "752905", "", "March", "", "459498", "234986",
"31544", "48178", "774206", "", "", "856970", "339674", "574527",
"118091", "1889262"), April = c("", "455673", "288710", "34451",
"48585", "827419", "", "", "443285", "264405", "34823", "47192",
"789705", "", "", "465613", "246425", "33618", "46274", "791930",
"", "April", "", "484299", "243867", "32719", "52333", "813218",
"", "", "909873", "364465", "627400", "126954", "2028693"), May = c("",
"474441", "297343", "35092", "51102", "857977", "", "", "451221",
"255887", "35499", "48459", "791065", "", "", "487738", "249522",
"34339", "47727", "819325", "", "May", "", "507807", "246136",
"34708", "59300", "847950", "", "", "969553", "382706", "655862",
"133455", "2141576"), June = c("", "475552", "299427", "35743",
"51440", "862162", "", "", "453152", "242889", "35798", "48147",
"779986", "", "", "488564", "241273", "34360", "48871", "813068",
"", "June", "", "528620", "246710", "37345", "62910", "875586",
"", "", "991274", "388697", "672773", "135550", "2188294"), July = c("",
"473007", "302075", "37771", "51027", "863880", "", "", "454387",
"231097", "35402", "47468", "768353", "", "", "480702", "229555",
"33915", "49112", "793284", "", "July", "", "543063", "241211",
"40403", "66658", "891335", "", "", "1005742", "395852", "683854",
"138853", "2224302"), August = c("", "462125", "294497", "37628",
"49773", "844023", "", "", "450648", "213017", "34363", "46614",
"744642", "", "", "472486", "215763", "32866", "48620", "769734",
"", "August", "", "565034", "236353", "42524", "66853", "910763",
"", "", "1010739", "393337", "691731", "141101", "2236908"),
September = c("", "461968", "295501", "37310", "50280", "845059",
"", "", "459276", "215403", "33801", "47297", "755777", "",
"", "475729", "219643", "33083", "47540", "775994", "", "September",
"", "593019", "244979", "44108", "70242", "952348", "", "",
"1035725", "408658", "698992", "141944", "2285320"), October = c("",
"459862", "296522", "36399", "51220", "844003", "", "", "465096",
"218792", "34168", "47369", "765424", "", "", "467151", "225828",
"33667", "47890", "774536", "", "October", "", "618854",
"259807", "47622", "71345", "997628", "", "", "1033560",
"421043", "710563", "140154", "2305320"), November = c("",
"456832", "296283", "35769", "50531", "839415", "", "", "467288",
"232593", "35039", "47415", "782335", "", "", "461950", "237117",
"35672", "47285", "782024", "", "November", "", "641864",
"275099", "50371", "72095", "1039428", "", "", "1008836",
"441652", "732948", "133861", "2317297"), December = c("",
"460343", "291348", "35781", "48298", "835771", "", "", "460574",
"231461", "35971", "47173", "775179", "", "", "462919", "235861",
"36251", "47974", "783006", "", "December", "", "672533",
"276525", "54603", "74717", "1078379", "", "", "982210",
"442448", "731546", "132982", "2289187")), .Names = c("Region",
"January", "February", "March", "April", "May", "June", "July",
"August", "September", "October", "November", "December"), row.names = 29:63, class = "data.frame")
mydf <- mydf[which(grepl("^$", mydf$January) == FALSE),] # remove rows with nothing in the January column
mydf[which(nchar(mydf$January) == 4) ,'Region'] <- 'mydate' # add a row label for 'year' rows
mydf[which(mydf$Region == 'mydate'), 2:13] <- apply(mydf[which(mydf$Region == 'mydate'), 2:13], 1, function(x) as.character(seq(as.Date(paste(x['January'],"-01-01", sep = "")), as.Date(paste(x['January'],"-12-01", sep = "")), by = 'month')))
您可以使用xlsReadWrite
和reshape2
library(xlsReadWrite)
tdata<-read.xls('GSR1976-June 2012.xls',stringsAsFactors=F)
tdata[85,2]<-1987 # fix for missing year
tdata[228,2]<-2007 # fix for missing year
year.marker<-c(grep('^[[:digit:]]{4}$',tdata[,2]),270)
temp.df<-NULL
for(i in seq_along(year.marker)[-length(year.marker)]){
dum.df<-cbind(tdata[year.marker[i],2],tdata[(year.marker[i]+1):(year.marker[i+1]-2),])
temp.df<-rbind(temp.df,dum.df)
}
names(temp.df)<-c('year','region',month.name)
df1<-temp.df[!temp.df[,'region']=='',]
library(reshape2)
df2<-melt(df1, id.vars=c("region", "year"))
我采取以下方法:
首先,我將文件轉換為CSV,然后讀取其中的行。我使用grep()
查找“美國”,這是每組中的第一行。 我手動輸入了開始年份和結束年份,但是在那里也可能會使用一些grep
。
temp = readLines("GSR1976-June 2012.csv")
START = grep("Americas", temp)
YEARS = 1976:2012
之后,我創建了一個data.frame
列表,每年一次。
temp1 = lapply(1:length(YEARS),
function(x) read.csv("GSR1976-June 2012.csv",
header=FALSE, skip=START[x]-1,
nrows=5))
names(temp1) = YEARS
然后,我將它們組合到一個data.frame
並進行了一些清理。
temp2 = do.call(rbind, temp1)
names(temp2) = c("region", "jan", "feb", "mar", "apr", "may", "jun",
"jul", "aug", "sep", "oct", "nov", "dec")
temp2$year = rep(YEARS, each=5)
您沒有指定要執行的重塑類型,但如果要從長到長,最簡單的方法是使用reshape2
包:
library(reshape2)
temp3 = melt(temp2, id.vars=c("region", "year"))
list(head(temp3), tail(temp3))
# [[1]]
# region year variable value
# 1 Americas 1976 jan NA
# 2 Europe 1976 jan NA
# 3 Japan 1976 jan NA
# 4 Asia Pacific 1976 jan NA
# 5 Worldwide 1976 jan NA
# 6 Americas 1977 jan 195638
#
# [[2]]
# region year variable value
# 2215 Worldwide 2011 dec 23832532
# 2216 Americas 2012 dec NA
# 2217 Europe 2012 dec NA
# 2218 Japan 2012 dec NA
# 2219 Asia Pacific 2012 dec NA
# 2220 Worldwide 2012 dec NA
然后,對於聽起來像您要找的輸出,請使用dcast()
:
temp4 = dcast(temp3, year + variable ~ region)
head(temp4)
# year variable Americas Asia Pacific Europe Japan Worldwide
# 1 1976 jan NA NA NA NA NA
# 2 1976 feb NA NA NA NA NA
# 3 1976 mar 178295 16761 55602 10805 261463
# 4 1976 apr 178961 16513 60959 11589 268022
# 5 1976 may 187076 17396 62329 12435 279235
# 6 1976 jun 193675 17712 61676 14411 287475
可以使用XLConnect輕松地從Excel文件中直接處理上述數據集,如下所示:
require(XLConnect)
require(reshape2)
# Load Excel workbook
wb = loadWorkbook("~/Downloads/GSR1976-June 2012.xls")
# Read data from 1st worksheet, starting at row 7 with predefined column types
data = readWorksheet(wb, sheet = 1, startRow = 7,
colTypes = c("character", rep("numeric", 12)))
# Rename first column and keep month names
colnames(data)[1] = "Region"
months = names(data)[-1]
# The data of merged cells (years) is in the first cell of the merged region
years = ifelse(is.na(data$Region), data$January, NA)
idx = !is.na(years)
# Replicate year information to form a new column 'Year'
data$Year = rep(years[idx], times = diff(c(which(idx), length(years) + 1)))
# Remove any rows where 'Region' is missing (^= non-data rows)
data = data[!is.na(data$Region), ]
# Reshape (wide --> long)
data = melt(data, measure.vars = months, variable.name = "Month")
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.