I am extracting data from the page below, but my output repeats the same name
and surname
in every entry, even though the name and surname are different for each entry on the page. This is the page link: https://www.aeafa.es/asociados.php
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Scrape listing pages 1-4 of https://www.aeafa.es/asociados.php and collect
# the member details into a pandas DataFrame.
# NOTE(review): the indentation of this script was lost in the source; the
# nesting below is a reconstruction — confirm against the original.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}
# Accumulates one element per member card; turned into the DataFrame at the end.
temp = []
# NOTE(review): this dict is created exactly once, before any loop, and is
# the only dict the script ever fills in.
wev={}
for page in range(1, 5):
    r = requests.get(
        "https://www.aeafa.es/asociados.php?provinput=&_pagi_pg={page}".format(
            page=page
        ),
        headers=headers,
    )
    soup = BeautifulSoup(r.content, "lxml")
    details=soup.find('table',class_="table")
    for detail in details.find_all('tbody'):
        # Gathers the text of every <td> under this <tbody> into one flat
        # list, so link[0] and link[1] are simply the first two cells found.
        link = [up.text for up in detail.find_all("td")]
        name=link[0]
        # Each pass overwrites the same two keys of the shared dict.
        wev['Nombre']=name
        surname=link[1]
        wev["Apellidos"]=surname
    tag = soup.find_all("div", class_="col-md-8 col-sm-8")
    for pro in tag:
        # Text of each <p> inside the card; fields are picked by position.
        data = [tup.text for tup in pro.find_all("p")]
        # The fixed slice offsets below drop a leading prefix from each
        # paragraph — presumably the field label; TODO confirm the lengths.
        Dirección = data[2]
        Dirección = Dirección[12:]
        # NOTE(review): this key has a leading space (as do " Población" and
        # " Teléfono"; "Provincia " has a trailing one), so the resulting
        # column names carry that whitespace.
        wev[" Dirección"]= Dirección
        Población = data[3]
        Población = Población[14:]
        wev[" Población"]= Población
        Provincia = data[4]
        Provincia = Provincia[14:]
        wev["Provincia "]=Provincia
        Teléfono = data[5]
        # Prefix "+" and remove "." and "-" from the phone text.
        Teléfono = "+" + Teléfono[11:].replace(".", "")
        Teléfono= Teléfono.replace("-", '')
        wev[" Teléfono"]= Teléfono
        Email = data[6]
        Email = Email[10:]
        wev["Email"]= Email
        # NOTE(review): this appends a reference to the one shared `wev`
        # dict, not a copy. No new dict is created per member, so every
        # element of `temp` is the same object and shows whatever values
        # were written last.
        temp.append(wev)
df = pd.DataFrame(temp)
print(df)
The script prints the same name
and surname
in every entry. How can I correct it? This is the output:
Nombre Apellidos
0 JUAN ARIAS BARTOLOMÉ
1 JUAN ARIAS BARTOLOMÉ
One approach is to collect the name and surname from the table separately and then merge them with the details parsed from each member's "Sobre" (about) block. A check can also be added to stop once the last page is reached:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from unicodedata import normalize
import re
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"
}
LISTING_URL = "https://www.aeafa.es/asociados.php?provinput=&_pagi_pg={page}"
# Seconds to wait for the server; without a timeout a stalled connection
# would block the script indefinitely.
REQUEST_TIMEOUT = 30
# Compiled once because it is applied to every scraped cell.
WHITESPACE = re.compile(r"\s+")
# Built with the same normalisation as the scraped labels, so the lookup
# matches however the literal happens to be encoded in this file.
PHONE_KEY = normalize("NFKC", "Teléfono")


def clean_text(text):
    """Return *text* NFKC-normalised with whitespace runs collapsed.

    NFKC folds compatibility characters (non-breaking space, ordinal
    indicators) and leaves accented letters precomposed. The decomposed
    form (NFKD) would store "é" as "e" plus a combining accent, which does
    not compare equal to a normally typed "Teléfono" key.
    """
    return WHITESPACE.sub(" ", normalize("NFKC", text)).strip()


def parse_about(values):
    """Build one row dict from the cleaned paragraph texts of a member card.

    Args:
        values: cleaned text of each ``<p>`` in the card, in document order.
            The first item is expected to start with the word "Sobre";
            items from index 2 onwards are expected to be "label: value".

    Returns:
        A dict with a ``'Sobre'`` entry plus one entry per labelled item.
    """
    # Skip over the word "Sobre" (6 characters including the separator).
    row = {'Sobre': values[0][6:] if values else ''}
    for item in values[2:]:
        # Split on the first colon only, so values such as URLs stay intact.
        key, sep, value = item.partition(':')
        if not sep:
            continue  # not a "label: value" paragraph; nothing to store
        row[key.strip()] = value.strip()
    # Not every card is guaranteed to carry a phone line.
    if PHONE_KEY in row:
        row[PHONE_KEY] = row[PHONE_KEY].replace(".", "")
    return row


def parse_about_rows(soup):
    """Return one ``parse_about`` dict per member card found in *soup*."""
    rows = []
    for card in soup.find_all("div", class_="col-md-8 col-sm-8"):
        values = [clean_text(p.get_text(strip=True)) for p in card.find_all("p")]
        rows.append(parse_about(values))
    return rows


def parse_name_rows(soup):
    """Return the cleaned cell texts of each table row, minus the last cell.

    Returns an empty list when the page has no ``table.table`` or the table
    has no ``<tbody>``.
    """
    table = soup.find("table", class_="table")
    body = table.tbody if table is not None else None
    if body is None:
        return []
    return [
        [clean_text(td.get_text(strip=True)) for td in tr.find_all("td")[:-1]]
        for tr in body.find_all("tr")
    ]


def has_next_page(soup):
    """Return True when the last pagination item is the "»" (next) link."""
    ul = soup.find("ul", class_="pagination")
    if ul is None:
        return False
    items = ul.find_all("li")
    return bool(items) and items[-1].text == "»"


def merge_rows(about_rows, name_rows):
    """Prefix each about-row with the name and surname from the table.

    Rows are paired by position. Note: the dict merge operator requires
    Python 3.9 onwards.
    """
    if len(about_rows) != len(name_rows):
        # zip() would silently drop the surplus; make the mismatch visible.
        print(
            f"Warning: {len(about_rows)} about blocks but "
            f"{len(name_rows)} table rows; extra entries are ignored"
        )
    merged = []
    for about, names in zip(about_rows, name_rows):
        # Pad so a short table row yields empty fields instead of IndexError.
        nombre, apellidos = (list(names) + ['', ''])[:2]
        merged.append({'Nombre': nombre, 'Apellidos': apellidos} | about)
    return merged


def scrape_associates():
    """Walk the listing page by page and return all members as a DataFrame."""
    about_rows = []
    name_rows = []
    page = 1
    while True:
        print(f"Page {page}")
        r = requests.get(
            LISTING_URL.format(page=page),
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
        # Fail loudly on an HTTP error rather than parsing an error page.
        r.raise_for_status()
        soup = BeautifulSoup(r.content, "lxml")
        about_rows.extend(parse_about_rows(soup))
        name_rows.extend(parse_name_rows(soup))
        # Any more?
        if not has_next_page(soup):
            break
        page += 1
    # Merge the name and surname from the second table
    return pd.DataFrame(merge_rows(about_rows, name_rows))


def main():
    """Scrape the directory and print the resulting DataFrame."""
    df = scrape_associates()
    print(df)


if __name__ == "__main__":
    main()
Giving you a dataframe starting:
Nombre Apellidos Sobre Dirección Población Provincia Teléfono E-mail Web
0 JUAN MARIANO MERCADO Juan Mariano Mercado Juan de Toledo, no 16, 1o B 30800 LORCA Murcia 968-471716 periagomer@hotmail.com
1 Ma. BELEN ABAD GARCIA Ma. Belen Abad Garcia Calle Constantino 33, 1o N 4700 EL EJIDO Almería 950487533 - 647936929 mariabelenabadgarcia@hotmail.com
2 JESÚS ABAD MUÑIZ Jesús Abad Muñiz Santiago, 15, 1o.- ctro. 47001 Valladolid 98.320.20.11 jabad@carlosgallegoabogados.es
3 Ma PALOMA ABAD TEJERINA Ma Paloma Abad Tejerina Poniente, 40 28036 Madrid 91.383.11.45 paloma@abadsaezabogados.com
4 GEMA ÁBALOS MUÑOZ Gema ábalos Muñoz Solarillo de Gracia, 4, 1o.- D 18002 Granada 639.317.297 3004@icagr.es
You could then use Pandas to make any further changes to the data structure. Note that the dictionary merge operator (`|`) used above requires Python 3.9 or later.
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.