简体   繁体   中英

How to combine multiple pages from a website into one data frame in R?

Desired

I want to capture all the winners and their opponents from the IBJJF website. This includes details like their division, belt color, gym, rank, etc. I'm doing this by scraping the branches from within each link on this page and putting the results into a data frame. Before creating a loop, I scraped one page to test the code I wrote, and it worked exactly as I wanted it to.

This is what the end data frame should look like:

[Image: original data frame]

Problems

I created a loop to capture this information from multiple pages. The division, belt, gender, and weight each appear only once on each page. However, when I ran my code on just one page, R automatically filled in the correct information for each competitor. I would like it to automatically fill in the appropriate information for all pages. For example, if the belt color is Black, I want R to fill in "Black" next to each competitor name that appears on that page, just like in the photo.

Here is a sample of the code I wrote for the loop:

library(rvest)
library(tidyverse)

# Return the text of every node on `page` that matches the CSS selector `css`.
page_text <- function(page, css) {
  html_text(html_nodes(page, css))
}

# ACCUMULATORS ------------------------------------------------
# One vector per scraped field; each starts empty and is extended on every
# pass through the loop below.
division_all <- NULL
gender_all <- NULL
belt_all <- NULL
weight_all <- NULL
fightAndMat_all <- NULL
date_all <- NULL
competitor_all <- NULL
name_all <- NULL
gym_all <- NULL

# SCRAPE EVERY CATEGORY PAGE ----------------------------------
# The loop index is appended to the URL, so this visits category ids
# 2053146 through 2053170.
for (i in 46:70) {
  MensUrl <- read_html(
    paste0("https://www.bjjcompsystem.com/tournaments/1869/categories/20531", i)
  )

  # Category heading fields.
  division <- page_text(MensUrl, ".category-title__age-division")
  gender <- page_text(
    MensUrl,
    ".category-title__age-division+ .category-title__label"
  )
  belt <- page_text(MensUrl, ".category-title__label:nth-child(3)")
  weight <- page_text(MensUrl, ".category-title__label:nth-child(4)")

  # Bracket match header fields.
  fightAndMat <- page_text(
    MensUrl,
    ".bracket-match-header__where , .bracket-match-header__fight"
  )
  date <- page_text(MensUrl, ".bracket-match-header__when")

  # Match card fields.
  competitor <- page_text(MensUrl, ".match-card__competitor-n")
  name <- page_text(MensUrl, ".match-card__competitor-name")
  gym <- page_text(MensUrl, ".match-card__club-name")

  # Add this page's values to the end of each accumulator.
  division_all <- c(division_all, division)
  gender_all <- c(gender_all, gender)
  belt_all <- c(belt_all, belt)
  weight_all <- c(weight_all, weight)
  fightAndMat_all <- c(fightAndMat_all, fightAndMat)
  date_all <- c(date_all, date)
  competitor_all <- c(competitor_all, competitor)
  name_all <- c(name_all, name)
  gym_all <- c(gym_all, gym)
}

# BUILD THE MATCH DATA FRAME ----------------------------------
# data.frame() requires its columns to have compatible lengths; it stops with
# "arguments imply differing number of rows" when the accumulated vectors do
# not line up.
matches <- data.frame(
  division = division_all,
  gender = gender_all,
  belt = belt_all,
  weight = weight_all,
  fightAndMat = fightAndMat_all,
  date = date_all,
  competitor = competitor_all,
  name = name_all,
  gym = gym_all
)

> Error in data.frame(division = division_all, gender = gender_all, belt = belt_all,  : 
  arguments imply differing number of rows: 25, 1818, 909, 2601, 2207

I would suggest using a list to store the elements scraped in each iteration,
and a second list to hold those per-iteration lists.
Here is the beginning of the solution:

library(rvest)
library(tidyverse)

# PAGES TO SCRAPE ---------------------------------------------
# Each page id is appended to the base URL, giving category ids 2053146
# through 2053170. The range is defined once so that the size of the result
# list and the loop bounds cannot drift apart.
page_ids <- 46:70
base_url <- "https://www.bjjcompsystem.com/tournaments/1869/categories/20531"

# One slot per page, preallocated so the list is never grown inside the loop.
liste_df <- vector("list", length(page_ids))

# SCRAPE EVERY PAGE -------------------------------------------
for (l in seq_along(page_ids)) {
  MensUrl <- read_html(paste0(base_url, page_ids[[l]]))

  ## Category heading fields ----
  division <- MensUrl %>%
    html_nodes(".category-title__age-division") %>%
    html_text()

  gender <- MensUrl %>%
    html_nodes(".category-title__age-division+ .category-title__label") %>%
    html_text()

  belt <- MensUrl %>%
    html_nodes(".category-title__label:nth-child(3)") %>%
    html_text()

  weight <- MensUrl %>%
    html_nodes(".category-title__label:nth-child(4)") %>%
    html_text()

  ## Bracket match header fields ----
  fightAndMat <- MensUrl %>%
    html_nodes(".bracket-match-header__where , .bracket-match-header__fight") %>%
    html_text()

  date <- MensUrl %>%
    html_nodes(".bracket-match-header__when") %>%
    html_text()

  ## Match card fields ----
  competitor <- MensUrl %>%
    html_nodes(".match-card__competitor-n") %>%
    html_text()

  name <- MensUrl %>%
    html_nodes(".match-card__competitor-name") %>%
    html_text()

  gym <- MensUrl %>%
    html_nodes(".match-card__club-name") %>%
    html_text()

  # Changed relative to the question's loop: instead of appending to nine
  # global vectors, keep this page's fields together in one named list.
  # A list is used because the fields need not have the same length.
  liste_df[[l]] <- list(
    division = division,
    gender = gender,
    belt = belt,
    weight = weight,
    fightAndMat = fightAndMat,
    date = date,
    competitor = competitor,
    name = name,
    gym = gym
  )
}

# WHAT TO DO NEXT?

An example of an element of liste_df:

> liste_df[[1]]
$division
[1] "\nMaster 1 \n"

$gender
[1] "\n\n Male \n"

$belt
[1] "\n\n BLUE \n"

$weight
[1] "\n\n Rooster\n"

$fightAndMat
 [1] "FIGHT 30: Mat 3" "FIGHT 30:"       "FIGHT 41: Mat 3" "FIGHT 41:"      
 [5] "FIGHT 32: Mat 3" "FIGHT 32:"       "FIGHT 35: Mat 3" "FIGHT 35:"      
 [9] "FIGHT 38: Mat 3" "FIGHT 38:"       "FIGHT 31: Mat 3" "FIGHT 31:"      
[13] "FIGHT 34: Mat 3" "FIGHT 34:"      

$date
[1] "Sat 09/03 at 01:04 PM" "Sat 09/03 at 02:33 PM" "Sat 09/03 at 01:16 PM"
[4] "Sat 09/03 at 01:54 PM" "Sat 09/03 at 02:10 PM" "Sat 09/03 at 01:10 PM"
[7] "Sat 09/03 at 01:37 PM"

$competitor
 [1] "1" "8" "3" "1" "2" "7" "4" "1" "3" "7" "4" "6" "3" "5"

$name
 [1] "Anthony Garcia"             "Jonathan Ti Shih Yeung"     "Tony Thai Nguyen"          
 [4] "Anthony Garcia"             "Daniel Armando Avilez"      "Philip Paul Johns Jr"      
 [7] "Emanuel Paul Q Babala"      "Anthony Garcia"             "Tony Thai Nguyen"          
[10] "Philip Paul Johns Jr"       "Emanuel Paul Q Babala"      "Wesley Allen Morris"       
[13] "Tony Thai Nguyen"           "Daryl Charles Ayson Santos"

$gym
 [1] "American Top Team"           "Evolve MMA"                 
 [3] "CheckMat"                    "American Top Team"          
 [5] "Absolute Behring"            "Form Jiu Jitsu Academy"     
 [7] "Lansang Brazilian Jiu-Jitsu" "American Top Team"          
 [9] "CheckMat"                    "Form Jiu Jitsu Academy"     
[11] "Lansang Brazilian Jiu-Jitsu" "Roberto Traven BJJ"         
[13] "CheckMat"                    "Yemaso Brazilian Jiu-Jitsu" 

The technical post webpages of this site are published under the CC BY-SA 4.0 license. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM