下载多个txt文件R

Question

我想下载许多 .txt 文件。我有一个名为 “New_test” 的数据框，其中 URL 位于 “url” 列，目标文件名位于 “code” 列。

“ New_test.txt”

"url"   "code"
"1" "http://documents.worldbank.org/curated/en/704931468739539459/text/multi-page.txt" "704931468739539459.txt"
"2" "http://documents.worldbank.org/curated/en/239491468743788559/text/multi-page.txt"  "239491468743788559.txt"
"3" "http://documents.worldbank.org/curated/en/489381468771867920/text/multi-page.txt"  "489381468771867920.txt"
"4" "http://documents.worldbank.org/curated/en/663271468778456388/text/multi-page.txt"  "663271468778456388.txt"
"5" "http://documents.worldbank.org/curated/en/330661468742793711/text/multi-page.txt"  "330661468742793711.txt"
"6" "http://documents.worldbank.org/curated/en/120441468766519490/text/multi-page.txt"  "120441468766519490.txt"
"7" "http://documents.worldbank.org/curated/en/901481468770727038/text/multi-page.txt"  "901481468770727038.txt"
"8" "http://documents.worldbank.org/curated/en/172351468740162422/text/multi-page.txt"  "172351468740162422.txt"
"9" "http://documents.worldbank.org/curated/en/980401468740176249/text/multi-page.txt"  "980401468740176249.txt"
"10" "http://documents.worldbank.org/curated/en/166921468759906515/text/multi-page.txt" "166921468759906515.txt"
"11" "http://documents.worldbank.org/curated/en/681071468781809792/text/DRD169.txt" "681071468781809792.txt"
"12" "http://documents.worldbank.org/curated/en/358291468739333041/text/multi-page.txt" "358291468739333041.txt"
"13" "http://documents.worldbank.org/curated/en/716041468759870921/text/multi0page.txt" "716041468759870921.txt"
"14" "http://documents.worldbank.org/curated/en/961101468763752879/text/34896.txt"  "961101468763752879.txt"

这是脚本

# Question script (kept as posted): read a table of URLs and target file
# names, then try to download every file into the Sources/ directory.
# Removes every object from the current workspace before starting.
rm(list=ls())

require(quanteda)
library(stringr)

# NOTE(review): setwd() returns the directory that was current *before* the
# change, so `workingdir` holds the previous working directory rather than
# the path passed here.
workingdir <-setwd("~/Study/Master/Thesis/Mining/R/WorldBankDownl") 
# Read the tab-separated table; the code below uses its `url` and `code` columns.
test <- read.csv(paste0(workingdir,"/New_test.txt"), header = TRUE, 
stringsAsFactors = FALSE, sep="\t")

#Loop through every url in test_df and download in target directory with name = code
# NOTE(review): a data frame is a list of columns, so `for (url in test)`
# runs once per column (each `url` is a whole column), not once per row.
# NOTE(review): the opening brace of this loop is never closed in the snippet.
 for (url in test) {
 print(head(url))
 print(head(test$code))
 # Builds one destination path per row (a vector as long as test$code).
 destfile <- paste0('~/Study/Master/Thesis/Mining/R/WorldBankDownl/Sources/', test$code)
 # The full `test$url` vector is passed here; with method = "wget" this is
 # what raises "'url' must be a length-one character vector".
 download.file(test$url, destfile,  method = "wget", quiet=TRUE)

这是我得到的错误

Error in download.file(test$url, destfile, method = "wget", quiet = TRUE) : 
'url' must be a length-one character vector

Answer 1

这是一种简单的方法。 您将需要用test$url代替txturls （这两个字符向量都带有文本文件的URL）。

# Read several remote text files and combine them into one quanteda corpus,
# one document per URL. Replace `txturls` with `test$url` to use your own list.
txturls <- c("http://documents.worldbank.org/curated/en/704931468739539459/text/multi-page.txt", 
             "http://documents.worldbank.org/curated/en/239491468743788559/text/multi-page.txt",
             "http://documents.worldbank.org/curated/en/489381468771867920/text/multi-page.txt")

library("quanteda")

# One slot per URL, allocated up front instead of growing the vector in the loop.
txt <- character(length(txturls))
for (i in seq_along(txturls)) {
    # read the file from the i-th URL (the loop's own URL, not a fixed one);
    # close the connection whether or not the read succeeds
    con <- url(txturls[i])
    temp <- tryCatch(readLines(con), finally = close(con))
    # concatenate lines into one text
    temp <- texts(temp, groups = 1)
    # remove form feed character from the text that was just read
    temp <- gsub("\\f", "", temp)
    # store the cleaned text in this URL's slot
    txt[i] <- temp
}

# form the quanteda corpus
urlcorp <- corpus(txt, docvars = data.frame(source = txturls, stringsAsFactors = FALSE))
summary(urlcorp)
# NOTE: the output below was recorded from an earlier version of this loop
# that read the first URL on every iteration, which is why all three rows
# show identical counts. With the loop above each row reflects its own
# document, so the names in the Text column and the counts will differ.
#
# Corpus consisting of 3 documents:
# 
#  Text Types Tokens Sentences                                                                           source
#     1  1343   5125       135 http://documents.worldbank.org/curated/en/704931468739539459/text/multi-page.txt
#   1.1  1343   5125       135 http://documents.worldbank.org/curated/en/239491468743788559/text/multi-page.txt
#   1.2  1343   5125       135 http://documents.worldbank.org/curated/en/489381468771867920/text/multi-page.txt
# 
# Source: /Users/kbenoit/Dropbox (Personal)/GitHub/dictionaries_paper/* on x86_64 by kbenoit
# Created: Tue Mar 20 15:51:05 2018
# Notes:

Answer 2

大家好，感谢各位的帮助。对我来说，解决办法是更换所使用的下载方法。“wget” 方法要求 'url' 和 'destfile' 都是长度为一的字符向量，而我的 'url' 和 'destfile' 都是长度为十四的字符向量。现在我改用 “libcurl” 方法，它要求 'url' 和 'destfile' 这两个字符向量的长度相等。如果使用此方法，请确保设置 quiet = TRUE。

此外，即使循环本身可以正常工作，您仍可能收到如下错误。

Error in download.file(test$url, destfile, method = "libcurl", quiet = TRUE) : 
cannot download any files
In addition: There were 50 or more warnings (use warnings() to see the first 50)

这意味着源服务器跟不上您的请求频率，您实际上是在对它发起类似 DDoS 的访问，因此必须降低循环速度。

# Download every file listed in New_test.txt into <workingdir>/Sources,
# one request at a time with a short pause so the server is not flooded.

library(quanteda)
library(stringr)

# Keep the project directory as a plain path. The value returned by setwd()
# is the *previous* working directory, so it must not be used to build paths.
workingdir <- path.expand("~/Study/Master/Thesis/Mining/R/WorldBankDownl")
test <- read.csv(file.path(workingdir, "New_test.txt"), header = TRUE,
                 stringsAsFactors = FALSE, sep = "\t")

test <- data.frame(test)

# Make sure the target directory exists before writing into it.
target_dir <- file.path(workingdir, "Sources")
dir.create(target_dir, showWarnings = FALSE, recursive = TRUE)

# One destination path per row: <workingdir>/Sources/WB_<code>
destfile <- file.path(target_dir, paste0("WB_", test$code))

# Seconds to wait between requests; increase it if downloads start failing.
pause_seconds <- 1

# Loop through every row (not every column) of test and download each url
# to its own destination file.
for (i in seq_len(nrow(test))) {
  print(test$url[i])
  # A failed download is reported as a warning so the remaining files are
  # still attempted.
  tryCatch(
    download.file(test$url[i], destfile[i], method = "libcurl", quiet = TRUE),
    error = function(e) {
      warning("Failed to download ", test$url[i], ": ", conditionMessage(e),
              call. = FALSE)
    }
  )
  Sys.sleep(pause_seconds)
}

下载多个txt文件R

问题描述

2 个解决方案

解决方案1
0 2018-03-20 15:49:10

解决方案2
0 已采纳 2018-03-21 14:24:54

下载多个txt文件R

问题描述

2 个解决方案

解决方案1 0 2018-03-20 15:49:10

解决方案2 0 已采纳 2018-03-21 14:24:54

解决方案1
0 2018-03-20 15:49:10

解决方案2
0 已采纳 2018-03-21 14:24:54