简体   繁体   中英

Import multiple CSV files via HTTPS into R

I am trying to import multiple CSV files via HTTPS (from Google Drive Sheets) into R.

Here's what I did to import one CSV file using RCurl (which worked):

# Load packages ----
# library() errors immediately if a package is missing; require() only
# returns FALSE, letting the script fail later with a confusing message.
library(RCurl)
library(plyr)

# Fetch the published Google Sheet as CSV text over HTTPS, then parse it.
# skip = 1 drops the sheet's extra title row above the real header row.
x <- getURL("https://docs.google.com/spreadsheet/pub?key=0AsDUegPJ1ngvdDFLWXZXb08wMVIzY3JrX2tNU2dROEE&output=csv")
x <- read.csv(textConnection(x), header = TRUE, stringsAsFactors = FALSE, skip = 1)

Then, I created a data frame named "hashtags" with the URLs to 12 CSV files and their names in order to import all of the files. Here are the first six rows of hashtags

> head(hashtags)
name             url
1 #capstoneisfun https://docs.google.com/spreadsheet/pub?key=0AsDUegPJ1ngvdDFLWXZXb08wMVIzY3JrX2tNU2dROEE&output=csv
2 #CEP810        https://docs.google.com/spreadsheet/pub?key=0AsDUegPJ1ngvdFlQS2FPNzJsdS1TMVBuTHlQTS1FRnc&output=csv
3 #CEP811        https://docs.google.com/spreadsheet/pub?key=0AsDUegPJ1ngvdDhLcEI1a0U1T0I0Zm5RaU5UVWdmdlE&output=csv
4 #CEP812        https://docs.google.com/spreadsheet/pub?key=0AsDUegPJ1ngvdDJzMjZhN2pGa29QYU5weVhZdjRKdmc&output=csv
5 #CEP813        https://docs.google.com/spreadsheet/pub?key=0AsDUegPJ1ngvdGpJa0VMTmJNdzZ4UjBvUEx5cWsycEE&output=csv
6 #CEP815        https://docs.google.com/spreadsheet/pub?key=0AsDUegPJ1ngvdFB2R0czWjJ2SU9HQWR5VUVuODk3R0E&output=csv

What I'd like to do is import all of the files as data frames. I understand that an apply function or a for loop could do the trick, but both are a bit beyond my present capability.

This is a great place to use the curl() package, which provides "a drop-in replacement for url()" that works with HTTPS:

library(curl)

# Published-sheet CSV endpoints (https).
urls <- c(
  "https://docs.google.com/spreadsheet/pub?key=0AsDUegPJ1ngvdDFLWXZXb08wMVIzY3JrX2tNU2dROEE&output=csv",
  "https://docs.google.com/spreadsheet/pub?key=0AsDUegPJ1ngvdFlQS2FPNzJsdS1TMVBuTHlQTS1FRnc&output=csv"
)

# Open one curl connection per URL, then parse each connection into a
# data frame; the result is a list of data frames, one per sheet.
cons <- lapply(urls, function(u) curl(u))
lapply(cons, function(con) {
  read.csv(con, stringsAsFactors = FALSE, skip = 1)
})

Here's one using httr (which improves upon RCurl and also behaves better on Windows) and data.table's rbindlist, so you get a resultant data.table with all the tweets and hashtags in one object instead of having to work through a list. I'm only using dplyr since it's something I use every day now; you could easily remove it and substitute base operations for %>% :

library(httr)
library(dplyr)

# Lookup table mapping each hashtag to the published-CSV URL of its
# Google Sheet. comment.char = "" is required so the leading "#" in the
# hashtag values is not treated as a comment marker by read.table().
# NOTE: stringsAsFactors is spelled out in full; the original relied on
# partial argument matching ("stringsAs="), which is fragile and flagged
# by lintr.
hashtags <- read.table(text="hashtag,url
#capstoneisfun,https://docs.google.com/spreadsheet/pub?key=0AsDUegPJ1ngvdDFLWXZXb08wMVIzY3JrX2tNU2dROEE&output=csv
#CEP810,https://docs.google.com/spreadsheet/pub?key=0AsDUegPJ1ngvdFlQS2FPNzJsdS1TMVBuTHlQTS1FRnc&output=csv
#CEP811,https://docs.google.com/spreadsheet/pub?key=0AsDUegPJ1ngvdDhLcEI1a0U1T0I0Zm5RaU5UVWdmdlE&output=csv
#CEP812,https://docs.google.com/spreadsheet/pub?key=0AsDUegPJ1ngvdDJzMjZhN2pGa29QYU5weVhZdjRKdmc&output=csv
#CEP813,https://docs.google.com/spreadsheet/pub?key=0AsDUegPJ1ngvdGpJa0VMTmJNdzZ4UjBvUEx5cWsycEE&output=csv
#CEP815,https://docs.google.com/spreadsheet/pub?key=0AsDUegPJ1ngvdFB2R0czWjJ2SU9HQWR5VUVuODk3R0E&output=csv",
                       stringsAsFactors = FALSE, header = TRUE, sep = ",", comment.char = "")

# Download every sheet, tag each row with its source hashtag, and stack
# the per-hashtag pieces into a single data.table.
tweets <- data.table::rbindlist(by(hashtags, hashtags$hashtag, function(x) {
  doc <- GET(x$url)
  # skip = 1 drops the sheet's extra title row above the real header.
  dat <- read.csv(textConnection(content(doc, as = "text")), header = TRUE,
                  stringsAsFactors = FALSE, sep = ",", skip = 1)
  dat <- dat %>% mutate(hashtag = x$hashtag)
  dat
}))

nrow(tweets)
## [1] 1618

glimpse(tweets)

## Variables:
## $ Date         (chr) "12/12/2014 21:51:49", "11/19/2014 10:17:39", "11/16/2014 4:2...
## $ Twitter.User (chr) "https://twitter.com/matthewkoehler/status/543440594446868481...
## $ Followers    (int) 946, 895, 399, 12, 153, 881, 216, 865, 395, 12, 82, 857, 393,...
## $ Follows      (int) 994, 907, 1174, 24, 114, 887, 492, 869, 1148, 24, 201, 855, 1...
## $ Retweets     (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0...
## $ Favorites    (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0...
## $ Tweet.Text   (chr) "#capstoneisfun Awesome TA of the Week is @spgreenhalgh ! htt...
## $ hashtag      (chr) "#capstoneisfun", "#capstoneisfun", "#capstoneisfun", "#capst...

tweets$hashtag %>% unique

## [1] "#capstoneisfun" "#CEP810"        "#CEP811"        "#CEP812"       
## [5] "#CEP813"        "#CEP815"       

Perhaps:

# Download each sheet listed in `hashtags` and store the parsed data
# frame in a named list, keyed by the hashtag (column 1).
dfList <- list()
# seq_len() is safe when nrow(hashtags) == 0, unlike 1:nrow(hashtags),
# which would iterate over c(1, 0).
for (i in seq_len(nrow(hashtags))) {
  x <- getURL(hashtags[i, "url"])
  dfList[[hashtags[i, 1]]] <- read.csv(textConnection(x), header = TRUE,
                                       stringsAsFactors = FALSE, skip = 1)
}

Seemed to be successful (although I see no need to load the plyr package, and the code was tested without doing so). Top of output from str(dfList):

str(dfList)
List of 6
 $ #capstoneisfun:'data.frame': 63 obs. of  7 variables:
  ..$ Date        : chr [1:63] "12/12/2014 21:51:49" "11/19/2014 10:17:39" "11/16/2014 4:29:39" "11/14/2014 5:44:57" ...
  ..$ Twitter.User: chr [1:63] "https://twitter.com/matthewkoehler/status/543440594446868481" "https://twitter.com/matthewkoehler/status/534930982802321408" "https://twitter.com/spgreenhalgh/status/533756240837771265" "https://twitter.com/sarahfkeenan/status/533050416087715840" ...
  ..$ Followers   : int [1:63] 946 895 399 12 153 881 216 865 395 12 ...
  ..$ Follows     : int [1:63] 994 907 1174 24 114 887 492 869 1148 24 ...
  ..$ Retweets    : int [1:63] 0 0 0 0 0 0 0 0 0 0 ...
  ..$ Favorites   : int [1:63] 0 0 0 0 0 0 0 0 0 0 ...
  ..$ Tweet.Text  : chr [1:63] "#capstoneisfun Awesome TA of the Week is @spgreenhalgh ! http://t.co/fbKqtHAhcl" "Module 12 is beginning! #capstoneisfun" "Had a fantastic time with #capstoneisfun students today in exhibitions! So fun to see everyone's portfolios as they're finishin"| __truncated__ "@emstrazz, your intended audience can 
 # snipped rest

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM