[英]the condition has length > 1 and only the first element will be used Argument issue
我目前正在从该网站抓取数据。 www.allmusic.com
通过这种方式,我像这样在每个艺术家页面上专门抓取音乐风格。
rm(list= ls())
library(xml2)
library(rvest)
library(magrittr)
library(stringr)
library(plyr)
library(dplyr)
library(tidyr)
library(knitr)
library(XML)
library(data.table)
setwd("~/AlbumScrape")
load("popalbums2004-2014.rdata")
pages <- popalbums2004-2014.rdata %>% slice(1:100) %>%
mutate(URL = paste0("http://www.allmusic.com/album/",album,"/credits"))
credits_tmp = NULL
albumAttrs_tmp = NULL
album_genre_tmp = NULL
album_style_tmp = NULL
album_mood_tmp = NULL
album_theme_tmp = NULL
pb = txtProgressBar(min = 0, max = nrow(pages), initial = 0, char = "*", style = 3)
system.time(
for(i in seq(nrow(pages))) {
return_value <- try({
test403 <- read_html(paste0("http://www.allmusic.com/album /",pages$album[i],"/credits"))
})
if(class(return_value) == "try-error"){
print("failed");
} else {
print("succeeded")
src_url <- pages$URL[i] %>% read_html()
src_artist <- pages$src_artist[i] %>%
as.character()
album <- pages$album[i] %>%
as.character()
credits_artistURL <- try_default(
expr =
src_url %>%
html_nodes(".artist a") %>%
html_attr("href") %>%
str_trim() %>%
gsub("http://www.allmusic.com/album/","",.),
default = "", quiet = T)
credit_artistRole <- try_default(
expr =
src_url %>%
html_nodes(".credit") %>%
html_text() %>%
str_trim(),
default = "",
quiet = TRUE)
album_duration <-
try_default(
expr =
src_url %>%
html_nodes(".duration span") %>%
html_text() %>%
str_trim(),
default = "",
quiet = TRUE)
album_releaseDate <-
try_default(
expr =
src_url %>%
html_nodes(".release-date span") %>%
html_text() %>%
str_trim(),
default = "",
quiet = TRUE)
album_recordingDate <-
try_default(
expr =
src_url %>%
html_nodes(".recording-date div") %>%
html_text() %>%
str_trim(),
default = "",
quiet = TRUE)
album_genre <-
try_default(
expr =
src_url %>%
html_nodes(".genre a") %>%
html_attr("href") %>%
str_trim() %>%
gsub("http://www.allmusic.com/genre/","",.),
default = "",
quiet = TRUE)
album_style <-
try_default(
expr =
src_url %>%
html_nodes(".styles a") %>%
html_attr("href") %>%
str_trim() %>%
gsub("http://www.allmusic.com/style/","",.),
default = "",
quiet = TRUE)
album_mood <-
try_default(
expr =
src_url %>%
html_nodes(".mood a") %>%
html_attr("href") %>%
str_trim() %>%
gsub("http://www.allmusic.com/mood/","",.),
default = "",
quiet = TRUE)
album_theme <-
try_default(
src_url %>%
html_nodes(".theme a") %>%
html_attr("href") %>%
str_trim() %>%
gsub("http://www.allmusic.com/theme/","",.),
default = "",
quiet = TRUE)
credits_res_tmp <- try_default(
expr =
data.frame(credits_artistURL = credits_artistURL,
credit_artistRole = credit_artistRole,
src_artist = src_artist, album = album),
default =
data.frame(credits_artistURL = "ERROR",
credit_artistRole = "ERROR",
src_artist = src_artist, album = album), quiet = T) %>%
filter(., credits_artistURL != "#")
data_credits_tmp = list(credits_tmp, credits_res_tmp)
credits_tmp <- rbindlist(data_credits_tmp)
albumAttrs_res_tmp <- try_default(
expr =
data.frame(album_duration = album_duration,
album_releaseDate = album_releaseDate,
src_artist = src_artist, album = album),
default =
data.frame(album_duration = "ERROR",
album_releaseDate = "ERROR",
src_artist = src_artist, album = album), quiet = T)
data_albumAttrs_tmp = list(albumAttrs_tmp, albumAttrs_res_tmp)
albumAttrs_tmp <- rbindlist (data_albumAttrs_tmp)
album_genre_res_tmp <- try_default(
expr =
data.frame(album_genre = album_genre,
src_artist = src_artist, album = album),
default =
data.frame(album_genre = "ERROR",
src_artist = src_artist, album = album), quiet = T)
data_album_genre_tmp = list(album_style_tmp, album_genre_res_tmp)
album_genre_tmp <- rbindlist(data_album_genre_tmp)
album_style_res_tmp <- try_default(
expr =
data.frame(album_style = album_style,
src_artist = src_artist, album = album),
default =
data.frame(album_style = "ERROR",
src_artist = src_artist, album = album), quiet = T)
data_album_style_tmp = list(album_style_tmp, album_style_res_tmp)
album_style_tmp <- rbindlist(data_album_style_tmp)
album_mood_res_tmp <- try_default(
expr =
data.frame(album_mood = album_mood,
src_artist = src_artist, album = album),
default =
data.frame(album_mood = "ERROR",
src_artist = src_artist, album = album), quiet = T)
data_album_mood_tmp = list(album_mood_tmp, album_mood_res_tmp)
album_mood_tmp <- rbindlist (data_album_mood_tmp)
album_theme_res_tmp <- try_default(
expr =
data.frame(album_theme = album_theme,
src_artist = src_artist, album = album),
default =
data.frame(album_theme = "ERROR",
src_artist = src_artist, album = album), quiet = T)
data_album_theme_tmp = list(album_theme_tmp, album_theme_res_tmp)
album_theme_tmp <- rbindlist(data_album_theme_tmp)
Sys.sleep(runif(1, 0.01, 0.05))
if(i %% 10 == 0) {setTxtProgressBar(pb, i)}
}}
)
这为我带来了一些结果,但是与一些艺术家一起,我在样式字段中收到一些错误。
我制作了一个模仿第一个脚本的单独脚本,但仅加载出现错误的艺术家的保存数据框。
脚本是这样的。
rm(list= ls())
library(xml2)
library(rvest)
library(magrittr)
library(stringr)
library(plyr)
library(dplyr)
library(tidyr)
library(knitr)
library(XML)
library(data.table)
setwd("~/AlbumScrape")
load("style_errors.Rdata")
pages <- style_errors %>% slice(1:100) %>%
mutate(URL = paste0("http://www.allmusic.com/album/",album,"/credits"))
credits_tmp = NULL
albumAttrs_tmp = NULL
album_genre_tmp = NULL
album_style_tmp = NULL
album_mood_tmp = NULL
album_theme_tmp = NULL
pb = txtProgressBar(min = 0, max = nrow(pages), initial = 0, char = "*", style = 3)
system.time(
for(i in seq(nrow(pages))) {
return_value <- try({
test403 <- read_html(paste0("http://www.allmusic.com/album/",pages$album[i],"/credits"))
})
if(class(return_value) == "try-error"){
print("failed");
} else {
print("succeeded")
src_url <- pages$URL[i] %>% read_html()
src_artist <- pages$src_artist[i] %>%
as.character()
album <- pages$album[i] %>%
as.character()
album_style <-
try_default(
expr =
src_url %>%
html_nodes(".styles a") %>%
html_attr("href") %>%
str_trim() %>%
gsub("http://www.allmusic.com/style/","",.),
default = "",
quiet = TRUE)
album_style_res_tmp <- try_default(
expr =
data.frame(album_style = album_style,
src_artist = src_artist, album = album),
default =
data.frame(album_style = "ERROR",
src_artist = src_artist, album = album), quiet = T)
data_album_style_tmp = list(album_style_tmp, album_style_res_tmp)
album_style_tmp <- rbindlist(data_album_style_tmp)
Sys.sleep(runif(1, 0.01, 0.05))
if(i %% 10 == 0) {setTxtProgressBar(pb, i)}
}}
)
但是,使用此工具后,我在抓取中仍然遇到错误。
1:In if (class(return_value) == "try-error") { ... :
the condition has length > 1 and only the first element will be used
2: In if (class(return_value) == "try-error") { ... : etc etc
我认为问题出在我来自某些网页(例如http://www.allmusic.com/album/ultimate-darkness-mw0000481576/credits)中的论点,其中有多个字段,因此在我的网站中创建了超过1个单位长度的内容逻辑。
我不确定如何更改代码以包括这些因素,因此我可以收集样式字段。
返回对象具有多个类。 当使用if ()
,括号中的表达式需要计算为一个逻辑长度(或可以强制为1的值)。 如果该类的长度> 1,那么您将获得一个长度> 1的逻辑向量,这将导致您看到警告 。 例如:
foo <- 1:10
class(foo)
class(foo) <- c("myclass", class(foo))
class(foo)
class(foo) == "myclass"
> class(foo)
[1] "integer"
> class(foo) <- c("myclass", class(foo))
> class(foo)
[1] "myclass" "integer"
> class(foo) == "myclass"
[1] TRUE FALSE
> if(class(foo) == "myclass") {
+ print("Yay!")
+ }
[1] "Yay!"
Warning message:
In if (class(foo) == "myclass") { :
the condition has length > 1 and only the first element will be used
测试类成员资格的正确方法是通过类的继承。 这是通过使用inherits()
函数完成的:
> if(inherits(foo, "myclass")) {
+ print("Yay!")
+ }
[1] "Yay!"
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.