繁体   English   中英

在 R 中循环通过 URL

[英]Loop through URL in R

我一直在尝试为西班牙的连锁酒店 Melia 分析来自 Tripadvisor 的酒店。 选择了 35 家酒店后,我们想使用一个代码来创建一个 dataframe 循环遍历 URL,而不是手动逐个进行。

我们创建了基本代码,其中包括我们想要的酒店之一的信息。 (复制在下面)但是,虽然获得 session 的 url 是可能的,但我们的选择更加“随机”并且它不是网页类别之一(例如他们有一个类别是西班牙的酒店,但没有我们酒店的清单)。 你知道实现这一目标的方法吗?

刮泡泡

在此处输入图像描述

我们确实无法获得星星的代码。 这是我们在脚本之前使用的,以及我们找到的脚本。

url2 <- "https://www.tripadvisor.es/Hotel_Review-g187499-d239247-Reviews-Melia_Girona-Girona_Province_of_Girona_Catalonia.html"
TripHotel<-read_html(url2)

Estrellas <- TripHotel %>%
  html_nodes("_2TmwtWEr _30WZSV_9 uq1qMUbD") %>%
  html_text()
Estrellas


The script we get for StarRating is window._WEB_CONTEXT_={pageManifest:{"assets":["/components/dist/@ta/platform.polyfill-web-minimal.8629eb7b79.js","/components/dist/runtime.fe87536ca9.js","/components/dist/@ta/platform.runtime.e329407de2.js","/components/dist/vendor-babel.d45f1e0c4b.js","/components/dist/vendor-react-libs.7a36afd64a.js","/components/dist/vendor-redux-libs.aa893c3e8e.js","/components/dist/ta-platform.69750d80c2.js","/components/dist/lithium-platform.96ba6a9bc0.js","/components/dist/@ta/common.features.2a6979d658.js","/components/dist/@ta/platform.sentry.8f209cb2ac.js","/components/dist/@ta/platform.interactions.79fdeef568.js","/components/dist/vendor-apollo-libs.47c5785823.js","/components/dist/vendor-common.14a97ab607.js","/components/dist/@ta/common.responsive.4c9b587da5.js","/components/dist/lithium-routes.ea28cf22de.js","/components/dist/@ta/platform.monikers.cbbb67e0e2.js","/components/dist/@ta/core-ui.toast.d06b307144.js","/components/dist/@ta/platform.ssr-cache.49252e32a2.js","/components/dist/@ta/login.auth-gate-original.ef5977ab0d.js","/components/dist/@ta/login.auth-gate-lithium.43308c8956.js","/components/dist/@ta/login.login.b2621e9aa7.js","/components/dist/lithium-common.8e2735e535.js","/components/dist/vendor-urql.c12714ee06.js","/components/dist/vendor-lodash-libs.5c5eb190c8.js","/components/dist/@ta/common.object-util.c3bc3badfd.js","/components/dist/tslib.e90a942e35.js","/components/dist/vendor-react-dom-server.ce31892dfe.js","/components/dist/@ta/common.transitions.34c0d6a9f3.js","/components/dist/@ta/common.webview.3291fba6a5.js","/components/dist/@ta/modal.styleguide.7be39367d0.js","/components/dist/@ta/events.window.e20a52ae75.js","/components/dist/ta-common.b8bb120511.js","/components/dist/@ta/overlays.popover.085dd7b789.js","/components/dist/@ta/modal.fullscreen.21e9eb7e09.js","/components/dist/@ta/modal.vanilla.13e6154218.js","/components/dist/@ta/input.drop-zone.7afa95149d.js","/components/dist/@ta/media-viewer.types.6010627a44.js","/components/dist/react-transition-group.bdcd384ecd.js","/components/dist/@ta/overlays.pieces.31662725af.js","/components/dist/@ta/overlays.attached-arrow-overlay.b8013ac42f.js","/components/dist/@ta/modal.core.1d1febe0c4.js","/components/dist/@ta/overlays.attached-overlay.9f19a478e2.js","/components/dist/@ta/overlays.shift.63c6c0d933.js","/components/dist/@ta/events.keyboard-event-listener.cf353a32be.js","/components/dist/@ta/a11y.focus.b656b98684.js","/components/dist/@ta/events.window-resize.e97c27ed34.js","/components/dist/@ta/platform.runtime.e7e9ab5e5c.css","/components/dist/ta-platform.f5afcf1468.css","/components/dist/@ta/platform.monikers.80f1d43c6e.css","/components/dist/@ta/core-ui.toast.4dc70a9cf8.css","/components/dist/@ta/login.auth-gate-lithium.59bf39748e.css","/components/dist/lithium-common.a4449756a1.css","/components/dist/@ta/common.webview.dd67304f49.css","/components/dist/ta-common.fa2b80cda8.css","/components/dist/@ta/overlays.popover.bc2c0648bc.css","/components/dist/@ta/modal.fullscreen.a01c6b7342.css","/components/dist/@ta/modal.vanilla.dc35b80706.css","/components/dist/@ta/input.drop-zone.a2a1035b2d.css","/components/dist/@ta/overlays.pieces.5252ffa89a.css","/components/dist/@ta/modal.core.15bfe91f4f.css","/components/dist/t4b.t4b-header.beca207473.js","/components/dist/@ta/common.data-cacher.6b1422d012.js","/components/dist/@ta/t4b.fullstory.ad384c9af1.js","/components/dist/@ta/hrm.qualaroo-survey.dc06f3d1e6.js","/components/dist/@ta/hrm.subscription-utils.3f8a85d442.js","/components/dist/t4b.t4b-header.41189e4309.css","/components/dist/t4b.t4b-header-mw.77987e5a99.js","/components/dist/@ta/overlays.slide-in.54e1061c97.js","/components/dist/@ta/modal.slide-in.6be2335380.js","/components/dist/t4b.t4b-header-mw.a02bc6e3ad.css","/components/dist/@ta/modal.slide-in.bd9d180953.css","/components/dist/@ta/brand.footer.0ffbbc5cbd.js","/components/dist/@ta/core-ui.responsive-container.d2340afaf7.js","/components/dist/@ta/core-ui.link.cad10f6ef5.js","/components/dist/@ta/cpm.consent-mgmt.5505c307af.js","/components/dist/@ta/overlays.responsive-attached-overlay.6a62c30680.js","/components/dist/@ta/modal.headers.883409ed29.js","/components/dist/@ta/brand.breadcrumbs.4facfa91e7.js","/components/dist/@ta/seo.on-page-factors.6a471f02c0.js","/components/dist/@ta/cpm.universal-consent-platform-loader.d52599cc36.js","/components/dist/@ta/common.breadcrumbs.7bfa701f29.js","/components/dist/@ta/brand.nav-links.6459b150b0.js","/components/dist/@ta/brand.context.0db13f31d7.js","/components/dist/@ta/common.json-ld.cdce1b4d51.js","/components/dist/@ta/common.obfuscated-link.971bde5bac.js","/components/dist/@ta/common.text-html.432c62dc8c.js","/components/dist/@ta/common.client.e7412d80a4.js","/components/dist/@ta/brand.footer.eba6478b2c.css","/components/dist/@ta/core-ui.responsive-container.6a3e9d8142.css","/components/dist/@ta/core-ui.link.0f5c30e3f8.css","/components/dist/@ta/modal.headers.b268aa40c7.css","/components/dist/@ta/brand.breadcrumbs.7afc981777.css","/components/dist/@ta/common.breadcrumbs.a00e58e432.css","/components/dist/@ta/common.obfuscated-link.a0e0d02491.css","/components/dist/@ta/hotels.hr-about-amenities.9b050ed5d6.js","/components/dist/@ta/common.ssronly.e4be79bac9.js","/components/dist/@ta/styleguide.skeleton.b5edd4f833.js","/components/dist/@ta/hotels.hr-about-layout.454cd75b1c.js","/components/dist/@ta/styleguide.grouping-tabs.0c0c441774.js","/components/dist/@ta/hotels.tags.5bba931453.js","/components/dist/@ta/hotels.hr-about-amenities.40faf03e31.css","/components/dist/@ta/styleguide.skeleton.9ef3c38c68.css","/components/dist/@ta/hotels.hr-about-layout.49df83afe1.css","/components/dist/@ta/styleguide.grouping-tabs.38225b9733.css","/components/dist/brand.header/LithiumGlobalNavWrapper.demand.a8d9d8e6ba.js","/components/dist/@ta/brand.header.2b51087b14.js","/components/dist/@ta/brand.global-nav.02e9fa460e.js","/components/dist/@ta/core-ui.avatar.223f2ac085.js","/components/dist/@ta/memx.explicit-preferences-lithium.a313781717.js","/components/dist/@ta/onboarding.explicit-preferences-external-tracking.d784e24e65.js","/components/dist/@ta/brand.sponsored-geopill.c829aea775.js","/components/dist/@ta/search.typeahead.d08803f3ee.js","/components/dist/@ta/common.hover-chain.27bdc86591.js","/components/dist/@ta/styleguide.nav-tabs.77f1cb9ccb.js","/components/dist/@ta/memx.attr-promo-screen.5f64728279.js","/components/dist/@ta/onboarding.common.1d6a1bf224.js","/components/dist/@ta/onboarding.text-input.c9b9ad73f7.js","/components/dist/@ta/cpm.y8p4bb.01610be7f6.js","/components/dist/@ta/platform.routing.2a83cbc988.js","/components/dist/@ta/common.sessionstorage.91e1dbe02e.js","/components/dist/@ta/common.localstorage.a16fd2c438.js","/components/dist/@ta/common.typeahead.16ded082ae.js","/components/dist/@ta/list.navigable-list.3d2c6813ec.js","/components/dist/@ta/cpm.gpt-slots-controller.58f3976be7.js","/components/dist/@ta/cpm.rubicon-header-bidding.8f9f909334.js","/components/dist/@ta/events.window-visibility.91a17522ae.js","/components/dist/@ta/cpm.utils.45611a3edb.js","/components/dist/@ta/common.keyboardCodes.b2e4a5f8ad.js","/components/dist/@ta/input.text-input.2ab20c2563.js","/components/dist/@ta/cpm.adomik-slot-params.fe9b20aa6b.js","/components/dist/brand.header/LithiumGlobalNavWrapper.demand.a557b85179.css","/components/dist/@ta/brand.header.08133f1070.css","/components/dist/@ta/brand.global-nav.59fa1be068.css","/components/dist/@ta/core-ui.avatar.84d9d7bfb8.css","/components/dist/@ta/memx.explicit-preferences-lithium.f9d958fcd2.css","/components/dist/@ta/search.typeahead.1a69e046ec.css","/components/dist/@ta/memx.attr-promo-screen.e1370082f6.css","/components/dist/@ta/onboarding.common.e025519e12.css","/components/dist/@ta/onboarding.text-input.a91711d401.css","/components/dist/@ta/cpm.y8p4bb.d15f5fa775.css","/components/dist/@ta/common.typeahead.9453bd61be.css","/components/dist/@ta/input.text-input.948597fce8.css","/components/dist/@ta/hotels.hr-about-partial-match-filters.5157516ea8.js","/components/dist/@ta/hotels.hr-about-partial-match-filters.f4af8d1c27.css","/components/dist/@ta/hotels.hotel-review-about-csr.706b3fa8d2.js","/components/dist/@ta/overlays.tooltip.d87c721909.js","/components/dist/@ta/common.ajax.fbfa19c52b.js","/components/dist/@ta/common.text.b9ff6f2848.js","/components/dist/@ta/hotels.partner-content-shared-components.6a5a8fa946.js","/components/dist/@ta/common.photo-carousel.2abbd26e14.js","/components/dist/@ta/hotels.media-album-parts.268c77863a.js","/components/dist/@ta/media-viewer.opener.61bfb36637.js","/components/dist/@ta/business-advantage.use-hotel-travel-info.3c53e0b232.js","/components/dist/@ta/public.business-listing.dd1cc04579.js","/components/dist/@ta/hotels.hotel-offers.01b5407a3b.js","/components/dist/@ta/common.polygon.b15ea13de1.js","/components/dist/@ta/common.loadable.fd63e87a36.js","/components/dist/@ta/common.indicator-dots.dd848da747.js","/components/dist/@ta/common.accumulating-provider.834c99a063.js","/components/dist/@ta/public.graphql-util.618d630be3.js","/components/dist/@ta/hotels.media-album-configuration.cd0d532a86.js","/components/dist/@ta/media.metrics.c624a57f8e.js","/components/dist/@ta/common.cookie.35892691a9.js","/components/dist/@ta/media-viewer.hash-utils.e7e7fd1e02.js","/components/dist/@ta/media-viewer.tracking.03ca8bde05.js","/components/dist/@ta/common.dates.ea8681abcb.js","/components/dist/@ta/business-advantage.special-offer-lightbox.3408f3a4f5.js","/components/dist/@ta/overlays.modal.1061eb3223.js","/components/dist/@ta/public.dataroots.c36e49c3fc.js","/components/dist/@ta/public.data-api-polling.a11e0ebf6b.js","/components/dist/@ta/common.metrics.3e79a9ec7c.js","/components/dist/@ta/business-advantage.special-offer-lightbox-overlay.9bbb115255.js","/components/dist/@ta/business-advantage.special-offer-details-provider.bf58636ab5.js","/components/dist/@ta/business-advantage.special-offer-carousel.bac746ea9f.js","/components/dist/@ta/business-advantage.carousel.08ab0fcd2e.js","…

纬度和经度

在此处输入图像描述

在这里,我们获得了位置,但我们既无法获得经度也无法获得纬度。

更新代码

“data.frame 中的错误(酒店 = 酒店,位置 = 位置,整体评级 = 整体评级,:arguments 暗示不同的行数:1、0”

   getwd()
# Limpiamos la consola
cat("\014")

library(rvest)
library(plyr)
library(purrr)
library(stringr)

url2 <- "https://www.tripadvisor.es/Hotel_Review-g187499-d239247-Reviews-Melia_Girona-Girona_Province_of_Girona_Catalonia.html"
TripHotel<-read_html(url2)

getHotel <- function(TripHotel){
Hotel <- TripHotel %>%
  html_node(".masthead_h1") %>%
  html_text()
Hotel


TotalReviews <- TripHotel %>%
  html_nodes(".hotels-hotel-review-atf-info-parts-Rating__reviewCount--1sk1X") %>%
  html_text()
TotalReviews


Location <- TripHotel %>%
  html_nodes(".hotels-hotel-review-atf-info-parts-BusinessListingEntry__address--1Vy86") %>%
  html_text()
Location

OverallRating <- TripHotel %>%
  html_nodes(".hotels-hotel-review-about-with-photos-Reviews__overallRating--vElGA") %>%
  html_text()
  as.numeric()
OverallRating

ratingText <- TripHotel %>%
  html_nodes(".hotels-hotel-review-about-with-photos-Reviews__subratingRow--2u0CJ") %>%
  html_text()
ratingText

nodes <- read_html(url2) %>%
  html_nodes("[class*=subratingRow]")
nodes

ratingServiceValue <- map_df(nodes, function(node) {
  
  data.frame(rating = str_match(node%>%html_node('span')%>%html_attr('class'),'\\d+')%>%as.integer()/10,
             stringsAsFactors=FALSE)
})
ratingServiceValue

Ubicación <- ratingServiceValue[1,]
Limpieza <- ratingServiceValue[2,]
Servicio <- ratingServiceValue[3,]
Calidad_Precio <- ratingServiceValue[4,]

HotelCategory <- TripHotel %>%
  html_nodes(".hotels-hotel-review-about-with-photos-Reviews__subratingRow--2u0CJ, ui_bubble_rating") %>%
  html_text()
HotelCategory

Estilo <- TripHotel %>%
  html_node(".hotels-hr-about-layout-TextItem__textitem--2JToc+ .hotels-hr-about-layout-TextItem__textitem--2JToc") %>%
  html_text()
Estilo

HotelsDef <- data.frame(Hotel = Hotel, 
                        Location = Location, 
                        OverallRating = OverallRating, 
                        Ubicación = Ubicación, 
                        Limpieza = Limpieza, 
                        Servicio = Servicio, 
                        Calidad_Precio = Calidad_Precio, 
                        Estilo = Estilo, 
                        TotalReviews = TotalReviews)
HotelsDef
}

#CREAMOS UN DATASET CONJUNTO

melia1 <- read_html("melia.html")

melia1 %>% 
  html_nodes(css = "a[href]") %>% 
  html_attr("href") %>% 
  Filter(f = function(.) !grepl("javascript:void", .)) %>% 
  paste0("https://tripadvisor.com", .) %>% 
  cat(sep = "\n")

melia2 <- read_html("melia2.html")

melia2 %>% 
  html_nodes(css = "a[href]") %>% 
  html_attr("href") %>% 
  Filter(f = function(.) !grepl("javascript:void", .)) %>% 
  paste0("https://tripadvisor.com", .) %>% 
  cat(sep = "\n")

melia3 <- read_html("melia3.html")

melia3 %>% 
  html_nodes(css = "a[href]") %>% 
  html_attr("href") %>% 
  Filter(f = function(.) !grepl("javascript:void", .)) %>% 
  paste0("https://tripadvisor.com", .) %>% 
  cat(sep = "\n")

melia4 <- read_html("melia4.html")

melia4 %>% 
  html_nodes(css = "a[href]") %>% 
  html_attr("href") %>% 
  Filter(f = function(.) !grepl("javascript:void", .)) %>% 
  paste0("https://tripadvisor.com", .) %>% 
  cat(sep = "\n")

  #Los unimos

  get30hotels <- function(melia1){
    data <- data.frame()
    i=1
    for(i in 1:length(melia1)){
      TripHotel <- read_html(paste0("https://www.tripadvisor.com", melia1[i]))
      HotelsDef <- getHotel(TripHotel)
      data <- rbind.fill(data, HotelsDef)
      print(i)
    }
    data
  }
  
  HOTELS <- get30hotels(melia1)
  

太感谢了!

访问https://www.tripadvisor.com/Search?q=melia&geo=187427&ssrc=h&rf=1 (网址参数选择网站手动搜索)

打开浏览器开发者工具。 Go 到网络选项卡。 刷新。 寻找数据。 我们得到的最接近的是在此处输入图像描述

“复制响应”并将其保存为本地 html 文件 melia melia-tripadvisor.html

提取30个链接

library(rvest)

doc <- read_html("melia-tripadvisor.html")

doc %>% 
  html_nodes(css = "a[href]") %>% 
  html_attr("href") %>% 
  Filter(f = function(.) !grepl("javascript:void", .)) %>% 
  paste0("https://tripadvisor.com", .) 

接下来的 30 个链接可以通过访问搜索结果的第 2 页https://www.tripadvisor.com/Search?q=melia&geo=187427&ssrc=h&rf=2 (注意rf=2 )并复制上述步骤来提取


假设已经收集了 4 个这样的 HTML 文件,命名为melia1.htmlmelia2.html等。

getHotel中,通过替换为nodes <- TripHotel %>%来修复nodes <- read_html(url2) %>% >%

然后

n_files <- 4

files <- paste0("melia", seq_len(n_files), ".html")

extract_links <- function(html_file) {
  read_html(html_file) %>%
    html_nodes(css = "a[href]") %>%
    html_attr("href") %>%
    Filter(f = function(.) !grepl("javascript:void", .)) %>%
    paste0("https://tripadvisor.com", .)
}

links <- unlist(lapply(files, extract_links))

HOTELS <- 
  links %>% 
  map(read_html) %>% 
  map_dfr(getHotel) 

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM