簡體   English   中英

條件的長度> 1,並且僅將使用第一個元素

[英]the condition has length > 1 and only the first element will be used Argument issue

我目前正在從該網站抓取數據。 www.allmusic.com

通過這種方式,我像這樣在每個藝術家頁面上專門抓取音樂風格。

# Scrape credits, basic attributes, genres, styles, moods and themes from the
# AllMusic credits page of each of the first 100 albums in the saved data set.
# Results are left in credits_tmp, albumAttrs_tmp, album_genre_tmp,
# album_style_tmp, album_mood_tmp and album_theme_tmp.
rm(list = ls())

library(xml2)
library(rvest)
library(magrittr)
library(stringr)
library(plyr)
library(dplyr)
library(tidyr)
library(knitr)
library(XML)
library(data.table)

setwd("~/AlbumScrape")

# `popalbums2004-2014.rdata` is not a syntactically valid R name, so the
# loaded object is fetched through the name(s) that load() returns.
# NOTE(review): assumes the album data frame is the first object stored in
# the file and that it has `album` and `src_artist` columns -- confirm.
loaded_names <- load("popalbums2004-2014.rdata")
album_data <- get(loaded_names[1])

pages <- album_data %>%
  slice(1:100) %>%
  mutate(URL = paste0("http://www.allmusic.com/album/", album, "/credits"))

# Trimmed text of every node matching `css`; "" when extraction fails.
scrape_text <- function(page, css) {
  try_default(
    expr =
      page %>%
      html_nodes(css) %>%
      html_text() %>%
      str_trim(),
    default = "",
    quiet = TRUE
  )
}

# Trimmed href of every node matching `css`, with the pattern `prefix`
# removed; "" when extraction fails.
scrape_href <- function(page, css, prefix) {
  try_default(
    expr =
      page %>%
      html_nodes(css) %>%
      html_attr("href") %>%
      str_trim() %>%
      gsub(prefix, "", .),
    default = "",
    quiet = TRUE
  )
}

# Data frame of the scraped `columns` (a named list of vectors) plus the
# identifying src_artist/album columns. If data.frame() fails (for example
# because the columns have incompatible lengths), a single row with "ERROR"
# in every scraped column is returned instead.
build_rows <- function(columns, src_artist, album) {
  keys <- list(src_artist = src_artist, album = album)
  try_default(
    expr = do.call(data.frame, c(columns, keys)),
    default = do.call(
      data.frame,
      c(lapply(columns, function(x) "ERROR"), keys)
    ),
    quiet = TRUE
  )
}

n_pages <- nrow(pages)

# One slot per page; slots of pages that fail to download stay NULL and are
# skipped by rbindlist(). Binding once after the loop avoids re-copying the
# accumulated tables on every iteration.
credits_rows <- vector("list", n_pages)
albumAttrs_rows <- vector("list", n_pages)
album_genre_rows <- vector("list", n_pages)
album_style_rows <- vector("list", n_pages)
album_mood_rows <- vector("list", n_pages)
album_theme_rows <- vector("list", n_pages)

pb <- txtProgressBar(min = 0, max = n_pages, initial = 0, char = "*",
                     style = 3)

system.time(
  for (i in seq_len(n_pages)) {

    # Download the page once; the parsed document is reused below instead of
    # being requested a second time.
    return_value <- try(read_html(pages$URL[i]))

    # inherits() always yields a single TRUE/FALSE, whereas comparing class()
    # with == yields one value per class of the object.
    if (inherits(return_value, "try-error")) {
      print("failed")
    } else {
      print("succeeded")

      src_url <- return_value
      src_artist <- as.character(pages$src_artist[i])
      album <- as.character(pages$album[i])

      # NOTE(review): the artist links are stripped of the ".../album/"
      # prefix, exactly as before -- confirm that ".../artist/" was not meant.
      credits_artistURL <- scrape_href(
        src_url, ".artist a", "http://www.allmusic.com/album/"
      )
      credit_artistRole <- scrape_text(src_url, ".credit")

      album_duration <- scrape_text(src_url, ".duration span")
      album_releaseDate <- scrape_text(src_url, ".release-date span")
      # Scraped but not stored in any of the result tables.
      album_recordingDate <- scrape_text(src_url, ".recording-date div")

      album_genre <- scrape_href(
        src_url, ".genre a", "http://www.allmusic.com/genre/"
      )
      album_style <- scrape_href(
        src_url, ".styles a", "http://www.allmusic.com/style/"
      )
      album_mood <- scrape_href(
        src_url, ".mood a", "http://www.allmusic.com/mood/"
      )
      album_theme <- scrape_href(
        src_url, ".theme a", "http://www.allmusic.com/theme/"
      )

      # Rows whose artist link is the placeholder "#" are dropped.
      credits_rows[[i]] <- build_rows(
        list(credits_artistURL = credits_artistURL,
             credit_artistRole = credit_artistRole),
        src_artist, album
      ) %>%
        filter(credits_artistURL != "#")

      albumAttrs_rows[[i]] <- build_rows(
        list(album_duration = album_duration,
             album_releaseDate = album_releaseDate),
        src_artist, album
      )

      # Genre rows go into the genre table only (they used to be appended to
      # a copy of the style table).
      album_genre_rows[[i]] <- build_rows(
        list(album_genre = album_genre), src_artist, album
      )
      album_style_rows[[i]] <- build_rows(
        list(album_style = album_style), src_artist, album
      )
      album_mood_rows[[i]] <- build_rows(
        list(album_mood = album_mood), src_artist, album
      )
      album_theme_rows[[i]] <- build_rows(
        list(album_theme = album_theme), src_artist, album
      )

      # Short random pause between requests.
      Sys.sleep(runif(1, 0.01, 0.05))

      if (i %% 10 == 0) {
        setTxtProgressBar(pb, i)
      }
    }
  }
)

close(pb)

credits_tmp <- rbindlist(credits_rows)
albumAttrs_tmp <- rbindlist(albumAttrs_rows)
album_genre_tmp <- rbindlist(album_genre_rows)
album_style_tmp <- rbindlist(album_style_rows)
album_mood_tmp <- rbindlist(album_mood_rows)
album_theme_tmp <- rbindlist(album_theme_rows)

這為我帶來了一些結果,但是對於某些藝術家,我在風格(style)字段中收到了一些錯誤。

我制作了一個模仿第一個腳本的單獨腳本,但僅加載出現錯誤的藝術家的保存數據框。

腳本是這樣的。

# Re-scrape only the style links for the albums stored in style_errors.Rdata
# (at most the first 100 rows). The result accumulates in album_style_tmp.
rm(list = ls())

library(xml2)
library(rvest)
library(magrittr)
library(stringr)
library(plyr)
library(dplyr)
library(tidyr)
library(knitr)
library(XML)
library(data.table)

setwd("~/AlbumScrape")

load("style_errors.Rdata")

pages <- mutate(
  slice(style_errors, 1:100),
  URL = paste0("http://www.allmusic.com/album/", album, "/credits")
)

credits_tmp <- NULL
albumAttrs_tmp <- NULL
album_genre_tmp <- NULL
album_style_tmp <- NULL
album_mood_tmp <- NULL
album_theme_tmp <- NULL

pb <- txtProgressBar(min = 0, max = nrow(pages), initial = 0, char = "*",
                     style = 3)

system.time(
  for (i in seq(nrow(pages))) {

    # Probe request: only used to find out whether the page can be read.
    return_value <- try({
      test403 <- read_html(
        paste0("http://www.allmusic.com/album/", pages$album[i], "/credits")
      )
    })

    # NOTE: this comparison produces one logical value per class of
    # return_value, so it is the source of the "condition has length > 1"
    # message; inherits(return_value, "try-error") is the scalar-safe test.
    if (class(return_value) == "try-error") {
      print("failed")
    } else {
      print("succeeded")

      src_url <- read_html(pages$URL[i])
      src_artist <- as.character(pages$src_artist[i])
      album <- as.character(pages$album[i])

      # href of every ".styles a" link with the style URL prefix removed;
      # "" when the extraction fails.
      album_style <- try_default(
        expr = gsub(
          "http://www.allmusic.com/style/",
          "",
          str_trim(html_attr(html_nodes(src_url, ".styles a"), "href"))
        ),
        default = "",
        quiet = TRUE
      )

      # One row per style; a single "ERROR" row when data.frame() fails.
      album_style_res_tmp <- try_default(
        expr = data.frame(
          album_style = album_style,
          src_artist = src_artist,
          album = album
        ),
        default = data.frame(
          album_style = "ERROR",
          src_artist = src_artist,
          album = album
        ),
        quiet = TRUE
      )

      data_album_style_tmp <- list(album_style_tmp, album_style_res_tmp)
      album_style_tmp <- rbindlist(data_album_style_tmp)

      # Short random pause between requests.
      Sys.sleep(runif(1, 0.01, 0.05))

      if (i %% 10 == 0) {
        setTxtProgressBar(pb, i)
      }
    }
  }
)

但是,使用這個腳本后,我在抓取時仍然遇到錯誤:

1:In if (class(return_value) == "try-error") { ... :
the condition has length > 1 and only the first element will be used
2: In if (class(return_value) == "try-error") { ... : etc etc

我認為問題出在某些網頁(例如 http://www.allmusic.com/album/ultimate-darkness-mw0000481576/credits)上:這些頁面有多個字段,因此在我的判斷邏輯中產生了長度超過 1 的內容。

我不確定如何更改代碼以包括這些因素,因此我可以收集樣式字段。

返回的對象具有多個類。使用 if () 時,括號中的表達式需要計算為長度為 1 的邏輯值(或可以強制轉換為這種值的對象)。如果類向量的長度 > 1,那么比較的結果就是一個長度 > 1 的邏輯向量,這就會導致您看到的警告。例如:

# Start with a plain integer vector, whose class is a single string.
foo <- 1:10
class(foo)
# Prepend a second class name, so class(foo) now has two elements.
class(foo) <- c("myclass", class(foo))
class(foo)
# == compares element-wise, giving one logical value per class name.
class(foo) == "myclass"

> class(foo)
[1] "integer"
> class(foo) <- c("myclass", class(foo))
> class(foo)
[1] "myclass" "integer"
> class(foo) == "myclass"
[1]  TRUE FALSE
> if(class(foo) == "myclass") {
+ print("Yay!")
+ }
[1] "Yay!"
Warning message:
In if (class(foo) == "myclass") { :
  the condition has length > 1 and only the first element will be used

測試類成員資格的正確方法是通過類的繼承。 這是通過使用inherits()函數完成的:

> if(inherits(foo, "myclass")) {
+ print("Yay!")
+ }
[1] "Yay!"

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM