
BeautifulSoup - how do I scrape multiple links and then scrape the contents of each link?

I'm trying to do a scrape where the landing page has various links (the five sub-categories at the top): https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html

Within each of these categories is a list of products: https://mcavoyguns.co.uk/contents/en-uk/d411_Browning_B725_Shotguns.html

Each product listed has a link to further details (a direct link to the product as an individual page): https://mcavoyguns.co.uk/contents/en-uk/p74600_Browning-B725-Sporter-over-_-under.html

The scrape I've put together so far gets as far as creating a list of all the individual product page links required. But when I loop through each individual product link for data, I can't seem to get BeautifulSoup to map the data from those links. It's as though it stays on the previous page (if you will).
What am I missing to allow for that second "bounce" to the "product_link" address (e.g. https://mcavoyguns.co.uk/contents/en-uk/p74600_Browning-B725-Sporter-over-_-under.html) and allow me to scrape the data from there? I thought I might need to add a time.sleep(5) timer to allow everything to load, but I'm still getting nothing.

Code:

from bs4 import BeautifulSoup 
import math 
import requests 
import shutil 
import csv 
import pandas 
import numpy as np 
from pandas import DataFrame 
import re
import os 
import urllib.request as urllib2 
import locale 
import json 
from selenium import webdriver 
import lxml.html 
import time 
from selenium.webdriver.support.ui import Select  
os.environ["PYTHONIOENCODING"] = "utf-8" 


#selenium requests 

browser = webdriver.Chrome(executable_path='C:/Users/admin/chromedriver.exe')
browser.get("https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html") 
time.sleep(2) 

all_Outlinks=[] 
all_links=[]

soup = BeautifulSoup(browser.page_source, features="lxml") 
submenuFind = soup.find("div", "idx2Submenu") 
submenuItems = submenuFind.find_all("li", "GC34 idx2Sub") 

for submenuItem in submenuItems: 
    for link in submenuItem.select('a[href]'): 
        all_Outlinks.append("https://mcavoyguns.co.uk/contents/en-uk/" + link['href']) 
#print(all_Outlinks) 

for a_link in all_Outlinks:
    res = requests.get(a_link) 
    soup = BeautifulSoup(res.text, 'html.parser') 
    pageLinkDivs = soup.find_all("div", "column full")
    for pageLinkDiv in pageLinkDivs:
        for pageLink in pageLinkDiv.select('a[href]'):
            all_links.append("https://mcavoyguns.co.uk/contents/en-uk/" + pageLink['href'])
#print(all_links)
            
for product_link in all_links:
    time.sleep(5)
    resSecond = requests.get(product_link)
    soup = BeautifulSoup(resSecond.text, 'html.parser')
    model = soup.find("div", "GC75 ProductChoiceName")
    print(model)

PS: Apologies for the extra imports. They were copied and pasted from a previous script and will be removed once confirmed they aren't required.

That info is pulled dynamically from a script tag when using the browser. As you are using requests, it will not be in the location you might be looking. Instead, pull that info from the script tag.

In this case, I pull all the info related to a given model from within the script and generate a dataframe. I convert the string inside the script tag to a Python object with ast.literal_eval, then add the product url and product title to the dataframe.

Each df is added to a list, which is converted to a final dataframe. As I don't know what final header names would be required, I have left some with their default names.

I have also added handling for the case(s) where there are no model options listed for the given product.
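
Before the full script, here is a minimal, self-contained sketch of just that extraction step. The HTML snippet is an invented stand-in for a real product page (so the tag contents are assumptions; the real array shape varies by product):

import ast
import re

import pandas as pd
from bs4 import BeautifulSoup

# Invented stand-in for the options markup on a product page; the real
# script contents and column count vary by product.
html = """
<div class="ProductOptions">
<script>var models = [['B725 Sporter', '30in', 749.0], ['B725 Hunter', '28in', 689.0]];</script>
</div>
"""

soup = BeautifulSoup(html, 'html.parser')
script_text = soup.select_one('.ProductOptions script').string

# Grab the outer [[...]] JavaScript literal, evaluate it as a Python list
# of lists with ast.literal_eval, then build a dataframe from the rows.
rows = ast.literal_eval(re.search(r'(\[\[.*\]\])', script_text).group(1))
df = pd.DataFrame(rows)
print(df)

The full version: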


from bs4 import BeautifulSoup 
import math 
import requests 
import shutil 
import csv 
import pandas as pd
import numpy as np 
import re
import os 
import urllib.request as urllib2 
import locale 
import json 
from selenium import webdriver 
import lxml.html 
import time 
from selenium.webdriver.support.ui import Select  
import ast

os.environ["PYTHONIOENCODING"] = "utf-8" 

#selenium requests 
browser = webdriver.Chrome(executable_path='C:/Users/admin/chromedriver.exe')
browser.get("https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html") 
time.sleep(2) 

all_Outlinks=[] 
all_links=[]

soup = BeautifulSoup(browser.page_source, features="lxml") 
submenuFind = soup.find("div", "idx2Submenu") 
submenuItems = submenuFind.find_all("li", "GC34 idx2Sub") 

for submenuItem in submenuItems: 
    for link in submenuItem.select('a[href]'): 
        all_Outlinks.append("https://mcavoyguns.co.uk/contents/en-uk/" + link['href']) 
#print(all_Outlinks) 

with requests.Session() as s:
    
    for a_link in all_Outlinks:
        res = requests.get(a_link) 
        soup = BeautifulSoup(res.text, 'html.parser') 
        pageLinkDivs = soup.find_all("div", "column full")
        for pageLinkDiv in pageLinkDivs:
            for pageLink in pageLinkDiv.select('a[href]'):
                all_links.append("https://mcavoyguns.co.uk/contents/en-uk/" + pageLink['href'])
    
    results = []
    
    for product_link in all_links:
        # print(product_link)
        resSecond = s.get(product_link)
        soup = BeautifulSoup(resSecond.text, 'html.parser')
        title = soup.select_one('.ProductTitle').text
        
        try:
            # pull the [[...]] options array out of the script tag and
            # evaluate it as a Python object, one row per model option
            df = pd.DataFrame(ast.literal_eval(re.search(r'(\[\[.*\]\])', soup.select_one('.ProductOptions script').string).group(1)))
            df.iloc[:, -1] = product_link
        except:
            placeholder = ['No options listed'] * 8
            placeholder.append(product_link)
            df = pd.DataFrame([placeholder])
        
        df.insert(0, 'title', title)
        
        #print(df) # add headers you care about to df or do that at end on full list
        results.append(df)
final = pd.concat(results) # or add header here
print(final)

You could then look at speeding things up with multiprocessing and tidying the flow into functions:

from bs4 import BeautifulSoup 
import requests 
import pandas as pd
import re
import os 
import locale 
from selenium import webdriver 
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC 
import ast
from multiprocessing import Pool, cpu_count

def get_models_df(product_link):
    res = requests.get(product_link)
    soup = BeautifulSoup(res.text, 'lxml')
    title = soup.select_one('.ProductTitle').text

    try:
        df = pd.DataFrame(ast.literal_eval(re.search(r'(\[\[.*\]\])', soup.select_one('.ProductOptions script').string).group(1)))
        df.iloc[:, -1] = product_link
    except:
        placeholder = ['No options listed'] * 8
        placeholder.append(product_link)
        df = pd.DataFrame([placeholder])

    df.insert(0, 'title', title)
    return(df)


def get_all_pages(a_link):
    res = requests.get(a_link) 
    soup = BeautifulSoup(res.text, 'lxml') 
    all_links = ["https://mcavoyguns.co.uk/contents/en-uk/" + i['href'] for i in soup.select('.center-content > a')]   
    return all_links

if __name__ == '__main__':
    os.environ["PYTHONIOENCODING"] = "utf-8" 

    #selenium requests 
    browser = webdriver.Chrome(executable_path='C:/Users/admin/chromedriver.exe')
    browser.get("https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html") 
    all_outlinks = [i.get_attribute('href') for i in WebDriverWait(browser,10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".idx2Submenu a")))]
    browser.quit()
    
    with Pool(cpu_count()-1) as p:

        nested_links = p.map(get_all_pages , all_outlinks)
        flat_list = [link for links in nested_links for link in links]   
        results = p.map(get_models_df, flat_list)
        final = pd.concat(results)
        #print(final)
        final.to_csv('guninfo.csv', encoding='utf-8-sig', index = False)

So I said I would have a look at the other requested items, and they are indeed available with just requests. Some things that needed handling:

  1. Different headers present for different products; some missing headers
  2. Some unicode characters (there are still some encoding things to look at; sketched after this list)
  3. Handling cases where the description is missing
  4. Handling the "More" section
  5. Updating certain output values so Excel doesn't convert them to dates (also sketched after this list)
  6. Handling of NaN headers
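
Items 2 and 5 lend themselves to a quick standalone illustration. Here is a minimal sketch with invented sample values, mirroring what the full script below does with unicodedata.normalize and the apostrophe prefix:

import unicodedata

import pandas as pd

# Item 2: NFKD-normalize awkward characters scraped from the page, e.g. a
# non-breaking space and a typographic ellipsis (the sample value is invented).
desc = unicodedata.normalize('NFKD', 'Back-bored\xa0barrels\u2026')

# Item 5: values like "12-76" get silently converted to dates when the CSV
# is opened in Excel; prefixing with an apostrophe keeps them as text.
df = pd.DataFrame({'Calibre': ["'" + v for v in ['12-76', '20-76']], 'desc': desc})
df.to_csv('calibres.csv', index=False, encoding='utf-8-sig')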

TODO:

  1. One of the functions has now become a rabid monster and needs refactoring into smaller, friendlier function calls.

from bs4 import BeautifulSoup 
import requests 
import pandas as pd
import re
import os 
import locale 
from selenium import webdriver 
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC 
import ast
from multiprocessing import Pool, cpu_count
import numpy as np
import unicodedata

def get_models_df(product_link):

    resSecond = requests.get(product_link)
    soup = BeautifulSoup(resSecond.text, 'lxml')
    title = soup.select_one('.ProductTitle').text

    try:
        df = pd.DataFrame(ast.literal_eval(re.search(r'(\[\[.*\]\])', soup.select_one('.ProductOptions script').string).group(1)))
        
    except:
        placeholder = ['No options listed'] * 8
        df = pd.DataFrame([placeholder])
    
    df.insert(0, 'title', title)
    df['price'] = ' '.join([soup.select_one("[property='product:price:amount']")['content'], 
                   soup.select_one("[property='product:price:currency']")['content']])
    df['weight'] = ' '.join([soup.select_one("[property='product:weight:value']")['content'], 
                    soup.select_one("[property='product:weight:units']")['content']])

    output_headers = ['Action frame', 'Barrel','Barrel finish','Barrel length', 
                      'Barrel length (mm-inch)','Buttstock','Calibre','Chokes','Code',
                      'Drop at comb','Drop at heel','Forearm','Length','N/A','Notes',
                      'Options','Packaging','Sights','Stock style','Top rib','Weight','Wood','Wood grade'
                     ]
    
    df = pd.concat([df, pd.DataFrame(columns = output_headers)])
    
    try:
        description_table = pd.read_html(str(soup.select_one('.ProductDetailedDescription table, table')))[0].transpose()
        description_table.dropna(axis=0, how='all',inplace=True)
        headers = list(description_table.iloc[0,:])
        headers[:] = ['N/A' if pd.isnull(np.array([header], dtype=object)) else header for header in headers]
        
        for number, header in enumerate(headers):
            temp = header.lower()
            value = description_table.iloc[1, number]
            if temp == 'calibre':
                df[header] = "'" + value
            elif  temp == 'top rib' and 'mm' not in value:
                df[header] = value + 'mm'
            else:
                df[header] = value
     
    except:
        pass # no table
        
    description = soup.select_one('#ProductDetailsTab [title=More]')
    
    if description is None:
        desc = 'N/A'
    else:
        desc = '. '.join([i.text for i in soup.select('.ProductDescription li, .ProductDescription span') if i.text !=''])
        if desc == '':
            desc = soup.select_one('.ProductIntroduction').get_text()

    df['desc'] = unicodedata.normalize('NFKD', desc)   
    df['product_link'] = product_link
    
    return(df)

def get_all_pages(a_link):
        
    res = requests.get(a_link) 
    soup = BeautifulSoup(res.text, 'lxml') 
    all_links = ["https://mcavoyguns.co.uk/contents/en-uk/" + i['href'] for i in soup.select('.center-content > a')]

    return all_links

if __name__ == '__main__':
    #os.environ["PYTHONIOENCODING"] = "utf-8" 

    #selenium requests 
    browser = webdriver.Chrome()  # executable_path='C:/Users/admin/chromedriver.exe'
    browser.get("https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html") 
    all_outlinks = [i.get_attribute('href') for i in WebDriverWait(browser,10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".idx2Submenu a")))]
    browser.quit()

    with Pool(cpu_count()-1) as p:

        nested_links = p.map(get_all_pages , all_outlinks)
        flat_list = [link for links in nested_links for link in links]
        results = p.map(get_models_df, flat_list)
        final = pd.concat(results)
        #print(final)
        final.to_csv('guninfo.csv', encoding='utf-8-sig', index = False)
        
        

As QHarr pointed out, Selenium was the answer. This gave me the direction to look at the problem with different eyes and allowed me to find the solution.

I'm posting this as my answer, but crediting @QHarr for the work, based on the code provided previously and the ongoing assistance that led to the solution.

from bs4 import BeautifulSoup
import math
import requests
import shutil
import csv
import pandas
import numpy as np
from pandas import DataFrame
import re
import os
import urllib.request as urllib2
import locale
import json
from selenium import webdriver
import lxml.html
import time
from selenium.webdriver.support.ui import Select 
os.environ["PYTHONIOENCODING"] = "utf-8"

#selenium requests
browser = webdriver.Chrome(executable_path='C:/Users/andrew.glass/chromedriver.exe')
browser.get("https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html") 
time.sleep(2) 

all_Outlinks=[] 
all_links=[]

soup = BeautifulSoup(browser.page_source, features="lxml") 
submenuFind = soup.find("div", "idx2Submenu") 
submenuItems = submenuFind.find_all("li", "GC34 idx2Sub") 

for submenuItem in submenuItems: 
    for link in submenuItem.select('a[href]'): 
        all_Outlinks.append("https://mcavoyguns.co.uk/contents/en-uk/" + link['href']) 
#print(all_Outlinks) 

for a_link in all_Outlinks:
    res = requests.get(a_link) 
    soup = BeautifulSoup(res.text, 'html.parser') 
    pageLinkDivs = soup.find_all("div", "column full")
    for pageLinkDiv in pageLinkDivs:
        for pageLink in pageLinkDiv.select('a[href]'):
            all_links.append("https://mcavoyguns.co.uk/contents/en-uk/" + pageLink['href'])
#print(all_links)
            
for product_link in all_links:

    # load each product page in the browser so the dynamically injected
    # content is rendered before parsing
    browser.get(product_link)
    time.sleep(5)
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    model = soup.find("div", "GC65 ProductOptions")
    # the model codes live in a <select>; take the first 14 characters of
    # each option's origvalue, skipping the first placeholder option
    modelFind = soup.find('select', attrs={'name': re.compile('model')})
    modelList = [x['origvalue'][:14] for x in modelFind.find_all('option')[1:]]
    print(modelList)

The model print is still a bit messy, but I can clean it up once all the requirements are gathered.
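
One possible cleanup, as a sketch assuming the mess is stray whitespace and empty entries (the sample values are invented):

# Invented sample of what modelList might hold; the real values come from
# the option tags' origvalue attribute, sliced to 14 characters.
model_list = ['B3J271004  ', ' B3J271003', '']

# Strip stray whitespace and drop empty entries.
cleaned = [m.strip() for m in model_list if m.strip()]
print(cleaned)  # ['B3J271004', 'B3J271003']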
