简体   繁体   中英

Scrape data using beautifulsoup

I am extracting data from https://www.aeafa.es/asociados.php, but my code gives the same name and surname in every entry, even though the name and surname are different for each entry on the page. This is my code:

import requests
import pandas as pd
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}

# Accumulates one entry per scraped record; turned into a DataFrame at the end.
temp = []
# NOTE(review): this single dict is created once and reused for every record.
wev={}
# Fetch result pages 1 through 4.
for page in range(1, 5):
    r = requests.get(
        "https://www.aeafa.es/asociados.php?provinput=&_pagi_pg={page}".format(
            page=page
        ),
        headers=headers,
    )
    soup = BeautifulSoup(r.content, "lxml")
    details=soup.find('table',class_="table")
 
    # Iterates over <tbody> elements, not over table rows: `link` holds the
    # text of every <td> inside the tbody, and only the first two cells are
    # used. The name and surname are therefore set here, outside the
    # per-entry loop below, and are overwritten on each iteration.
    for detail in details.find_all('tbody'):
       
        link = [up.text for up in detail.find_all("td")]
        name=link[0]
        wev['Nombre']=name
        surname=link[1]
        wev["Apellidos"]=surname
        
  
    # One "col-md-8 col-sm-8" div per entry; its <p> texts are read by position.
    tag = soup.find_all("div", class_="col-md-8 col-sm-8")
    for pro in tag:
        data = [tup.text for tup in pro.find_all("p")]
        
         
             
        # Each field drops a fixed number of leading characters -- presumably
        # the field label (e.g. "Dirección: "); TODO confirm against the page.
        Dirección = data[2]
        Dirección = Dirección[12:]
        wev[" Dirección"]= Dirección

        Población = data[3]
        Población = Población[14:]
        wev[" Población"]= Población

        Provincia = data[4]
        Provincia = Provincia[14:]
        wev["Provincia "]=Provincia 

        # Prefix "+" and strip "." and "-" separators from the phone text.
        Teléfono = data[5]
        Teléfono = "+" + Teléfono[11:].replace(".", "")
        Teléfono=  Teléfono.replace("-", '')
        wev[" Teléfono"]= Teléfono



        Email = data[6]
        Email = Email[10:]
        wev["Email"]=  Email
        
        # NOTE(review): appends a reference to the same `wev` object each time,
        # not a copy, so every element of `temp` is the same dict and later
        # assignments are visible through all of them.
        temp.append(wev)

df = pd.DataFrame(temp)
print(df)

It prints the same name and surname in every entry. How can I correct it? This is the output:

  Nombre          Apellidos                                                                           
0     JUAN  ARIAS   BARTOLOMÉ     
1     JUAN  ARIAS   BARTOLOM

One approach would be to merge the separate name and surname details into the data from the about information. A test could also be added for when the last page is reached:

import requests
import pandas as pd
from bs4 import BeautifulSoup
from unicodedata import normalize
import re

# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}

# Phone column name in composed (NFC) form, written with an escape so the
# lookup does not depend on how this file's accented characters are encoded.
PHONE_KEY = "Tel\u00e9fono"


def clean(text):
    """Return *text* with compatibility characters folded and whitespace collapsed.

    NFKC is used rather than NFKD: both fold compatibility characters, but
    NFKD leaves accents as separate combining marks, which misaligns the
    printed DataFrame and makes dictionary keys differ from composed literals.
    """
    return re.sub(r'\s+', ' ', normalize('NFKC', text))


page = 1
data1 = []  # one dict per entry, built from the "about" blocks
data2 = []  # one list of cell texts per table row (name, surname, ...)

while True:
    print(f"Page {page}")
    r = requests.get(
        f"https://www.aeafa.es/asociados.php?provinput=&_pagi_pg={page}",
        headers=headers,
        timeout=30,  # do not hang forever on a stalled connection
    )
    r.raise_for_status()  # fail loudly instead of parsing an error page
    page += 1

    soup = BeautifulSoup(r.content, "lxml")

    for pro in soup.find_all("div", class_="col-md-8 col-sm-8"):
        values = [clean(p.get_text(strip=True)) for p in pro.find_all("p")]
        if not values:
            continue

        row = {'Sobre': values[0][6:]}     # skip over the word Sobre

        # Remaining paragraphs are "label: value" pairs; split on the first
        # colon only, and ignore any paragraph that has no colon at all.
        for item in values[2:]:
            key, sep, value = item.partition(':')
            if not sep:
                continue
            row[key.strip()] = value.strip()

        # Not every entry is guaranteed to carry a phone line.
        if PHONE_KEY in row:
            row[PHONE_KEY] = row[PHONE_KEY].replace(".", "")
        data1.append(row)

    table = soup.find("table", class_="table")
    details = table.tbody if table is not None else None

    if details is not None:
        for tr in details.find_all("tr"):
            # The last cell of each row is dropped; only the text cells are kept.
            data2.append([clean(td.get_text(strip=True)) for td in tr.find_all("td")[:-1]])

    # Any more? Stop when there is no pagination list or when its last item
    # is no longer the "next page" arrow.
    ul = soup.find("ul", class_="pagination")
    items = ul.find_all("li") if ul is not None else []

    if not items or items[-1].text != "»":
        break

# Merge the name and surname from the second table into each entry.
# zip() pairs the two lists by position and stops at the shorter one.
data = [
    {'Nombre': d2[0], 'Apellidos': d2[1]} | d1
    for d1, d2 in zip(data1, data2)
]

df = pd.DataFrame(data)
print(df)

Giving you a dataframe starting:

              Nombre                        Apellidos                                      Sobre                                                 Dirección                         Población    Provincia                 Teléfono                                 E-mail                                          Web
0       JUAN MARIANO                          MERCADO                       Juan Mariano Mercado                                Juan de Toledo, no 16, 1o B                        30800 LORCA       Murcia                968-471716                 periagomer@hotmail.com                                             
1          Ma. BELEN                      ABAD GARCIA                      Ma. Belen Abad Garcia                                 Calle Constantino 33, 1o N                      4700 EL EJIDO     Almería     950487533 - 647936929       mariabelenabadgarcia@hotmail.com                                             
2             JESÚS                      ABAD MUÑIZ                         Jesús Abad Muñiz                                   Santiago, 15, 1o.- ctro.                              47001   Valladolid              98.320.20.11         jabad@carlosgallegoabogados.es                                             
3          Ma PALOMA                    ABAD TEJERINA                    Ma Paloma Abad Tejerina                                               Poniente, 40                              28036       Madrid              91.383.11.45            paloma@abadsaezabogados.com                                             
4               GEMA                   ÁBALOS MUÑOZ                        Gema ábalos Muñoz                             Solarillo de Gracia, 4, 1o.- D                              18002      Granada               639.317.297                          3004@icagr.es

You could then use Pandas to make any further changes to the data structure. Note that the dictionary merge operator (`|`) requires Python 3.9 or later.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM