[英]webscraping through beautifulsoup python
我正在尝试通过一个有很多页面的网站抓取数据,每个页面上有10个列表,并且每个列表页面上都有一个显示联系人号码的超链接-但只有几个初始号码。 单击该超链接后,即可看到整个数字。 我无法找出将这些数字包括在我的数据中的出路。 下面是我的代码:
# Parse the detail page and gather the text of every href="#" anchor,
# which is where the (truncated) contact numbers appear on the page.
soup_2 = BeautifulSoup(pages.content, 'html.parser')
con_lin = soup_2.find_all('a', attrs={'href': '#'})

# Same collection the original loop built, written as a comprehension.
# NOTE(review): these anchors show only the first few digits; the full
# number is not in the anchor text (see the answer below).
Contact_Number = [anchor.text for anchor in con_lin]
PS:我正在使用Python3
任何帮助/输入将不胜感激
谢谢
感谢您的答复,我的整个代码是:
import requests
from bs4 import BeautifulSoup

# Build the four paginated search-result URLs (list-1 .. list-4).
urls = []
for i in range(1, 5):
    pages = "https://www.realcommercial.com.au/sold/in-luddenham%2c+nsw+2745%3bbadgerys+creek%2c+nsw+2555%3bkemps+creek%2c+nsw+2178%3bmount+vernon%2c+nsw+2178%3bcecil+park%2c+nsw+2178%3bgreendale%2c+nsw+2550%3baustral%2c+nsw+2179%3bwallacia%2c+nsw+2745%3berskine+park%2c+nsw+2759%3bbringelly%2c+nsw+2556%3brossmore%2c+nsw+2557/list-{0}?activeSort=date-desc&autoSuggest=true&includePropertiesWithin=includesurrounding&minFloorArea=10000".format(i)
    urls.append(pages)

Data = []
for info in urls:
    page = requests.get(info)
    soup = BeautifulSoup(page.content, 'html.parser')
    # Each search-result card links to its detail page via <a class="details-panel">.
    links = soup.find_all('a', attrs={'class': 'details-panel'})
    hrefs = [link['href'] for link in links]
    for href in hrefs:
        entry = []
        pages = requests.get(href)
        soup_2 = BeautifulSoup(pages.content, 'html.parser')
        # Address and sold date of the listing.
        Address_1 = soup_2.find_all('p', attrs={'class': 'full-address'})
        Address = [Address.text.strip() for Address in Address_1]
        Date = soup_2.find_all('li', attrs={'class': 'sold-date'})
        Sold_Date = [Sold_Date.text.strip() for Sold_Date in Date]
        # The summary list holds both "Land Area ..." and "Floor Area ..." entries;
        # split them by prefix.
        Area_1 = soup_2.find_all('ul', attrs={'class': 'summaryList'})
        Area_2 = [Area.text.strip() for Area in Area_1]
        Land_Area = [x for x in Area_2 if x.startswith('Land Area')]
        Floor_Area = [y for y in Area_2 if y.startswith('Floor Area')]
        # Property types are the children of the propTypes div; `+= span`
        # extends the list with each child tag's contents.
        Prop_Type = soup_2.find('div', attrs={'class': 'propTypes ellipsis'}).findChildren()
        Property_Type = []
        for span in Prop_Type:
            Property_Type += span
        Agency_1 = soup_2.find_all('div', attrs={'class': 'agencyName ellipsis'})
        Agency_Name = [Agency_Name.text.strip() for Agency_Name in Agency_1]
        Agent_1 = soup_2.find_all('div', attrs={'class': 'agentName ellipsis'})
        Agent_Name = [Agent_Name.text.strip() for Agent_Name in Agent_1]
        # Text of the href="#" anchors (only shows truncated numbers).
        con_lin = soup_2.find_all('a', attrs={'href': '#'})
        Contact_Number = []
        for number in con_lin:
            Cont = number.text
            Contact_Number.append(Cont)
        entry.append(Address)
        entry.append(Sold_Date)
        # BUG FIX: the original did `entry.append(Area)`, but `Area` is a
        # comprehension variable and does not leak in Python 3, so that line
        # raised NameError. Append the areas that were actually computed.
        entry.append(Land_Area)
        entry.append(Floor_Area)
        entry.append(Property_Type)
        entry.append(Agency_Name)
        entry.append(Agent_Name)
        entry.append(Contact_Number)
        Data.append(entry)
@Andersson:您建议的修改无效。 我得到如下输出
[[['Kemps Creek, address available on request'],
['Thu 01-Sep-16'],
['Land Area 10.00ha (24.71 acres) (approx)', 'Floor Area 10,000 m²'],
['Land/Development', 'Commercial Farming'],
['CBRE - Western Sydney'],
['Jason Edge'],
['MyCommercial',
'Previous',
'Next',
'Map',
'0410 6...',
' Save Property',
'Get Email Alerts',
'Real Estate Directory']],
[['320 - 340 Badgerys Creek Road, Badgerys Creek, NSW 2555'],
['Mon 22-Apr-13'],
['Land Area 10.00ha (24.71 acres) (approx)', 'Floor Area 10,000 m²'],
['Land/Development', 'Industrial/Warehouse', 'Retail'],
['CBRE - Western Sydney'],
['Frank Oliveri'],
['MyCommercial',
'Previous',
'Next',
'Map',
'+61 41...',
'Street View',
' Save Property',
'Get Email Alerts',
'Real Estate Directory']],
试试下面的代码。那个以 # 结尾的链接只是一个误导:即使你用它再发一次请求,也看不到完整号码。电话号码实际上存储在 data-value 属性中,要获取它可以像这样操作:
import requests
from bs4 import BeautifulSoup

main_link = "https://www.realcommercial.com.au/sold/in-luddenham%2c+nsw+2745%3bbadgerys+creek%2c+nsw+2555%3bkemps+creek%2c+nsw+2178%3bmount+vernon%2c+nsw+2178%3bcecil+park%2c+nsw+2178%3bgreendale%2c+nsw+2550%3baustral%2c+nsw+2179%3bwallacia%2c+nsw+2745%3berskine+park%2c+nsw+2759%3bbringelly%2c+nsw+2556%3brossmore%2c+nsw+2557/list-1?activeSort=date-desc&autoSuggest=true&includePropertiesWithin=includesurrounding&minFloorArea=10000"

def phone_parser(main_link):
    """Visit each listing card on the search page and print its agent phone."""
    listing_soup = BeautifulSoup(requests.get(main_link).text, "lxml")
    cards = listing_soup.select(".listing-card .details-panel")
    for card in cards:
        target_page(card['href'])

def target_page(link):
    """Fetch one detail page; the full number lives in the data-value attribute."""
    detail_soup = BeautifulSoup(requests.get(link).text, "lxml")
    anchors = detail_soup.select(".agentPhone [rel='showContactNumber']")
    print(anchors[0]['data-value'])

phone_parser(main_link)
部分结果:
0410 687 866
+61 419 018 356
0407 506 010
非常感谢Andersson,我已将您的建议实施如下:
import requests
from bs4 import BeautifulSoup

# Build the four paginated search-result URLs (list-1 .. list-4).
urls = []
for i in range(1, 5):
    pages = "https://www.realcommercial.com.au/sold/in-luddenham%2c+nsw+2745%3bbadgerys+creek%2c+nsw+2555%3bkemps+creek%2c+nsw+2178%3bmount+vernon%2c+nsw+2178%3bcecil+park%2c+nsw+2178%3bgreendale%2c+nsw+2550%3baustral%2c+nsw+2179%3bwallacia%2c+nsw+2745%3berskine+park%2c+nsw+2759%3bbringelly%2c+nsw+2556%3brossmore%2c+nsw+2557/list-{0}?activeSort=date-desc&autoSuggest=true&includePropertiesWithin=includesurrounding&minFloorArea=10000".format(i)
    urls.append(pages)

Data = []
for info in urls:
    page = requests.get(info)
    soup = BeautifulSoup(page.content, 'html.parser')
    links = soup.find_all('a', attrs={'class': 'details-panel'})
    hrefs = [link['href'] for link in links]
    for href in hrefs:
        entry = []
        pages = requests.get(href)
        soup_2 = BeautifulSoup(pages.content, 'html.parser')
        Address_1 = soup_2.find_all('p', attrs={'class': 'full-address'})
        Address = [Address.text.strip() for Address in Address_1]
        Date = soup_2.find_all('li', attrs={'class': 'sold-date'})
        Sold_Date = [Sold_Date.text.strip() for Sold_Date in Date]
        Area_1 = soup_2.find_all('ul', attrs={'class': 'summaryList'})
        Area_2 = [Area.text.strip() for Area in Area_1]
        Land_Area = [x for x in Area_2 if x.startswith('Land Area')]
        Floor_Area = [y for y in Area_2 if y.startswith('Floor Area')]
        Prop_Type = soup_2.find('div', attrs={'class': 'propTypes ellipsis'}).findChildren()
        Property_Type = []
        for span in Prop_Type:
            Property_Type += span
        Agency_1 = soup_2.find_all('div', attrs={'class': 'agencyName ellipsis'})
        Agency_Name = [Agency_Name.text.strip() for Agency_Name in Agency_1]
        Agent_1 = soup_2.find_all('div', attrs={'class': 'agentName ellipsis'})
        Agent_Name = [Agent_Name.text.strip() for Agent_Name in Agent_1]
        # FIX: the original did `select(...)[0]['data-value']`, which raises
        # IndexError on listings with no phone element and silently drops the
        # numbers of all but the first agent. Collect every match instead.
        Contacts = [a['data-value'] for a in
                    soup_2.select(".agentPhone [rel='showContactNumber']")]
        entry.append(Address)
        entry.append(Sold_Date)
        entry.append(Land_Area)
        entry.append(Floor_Area)
        entry.append(Property_Type)
        entry.append(Agency_Name)
        entry.append(Agent_Name)
        entry.append(Contacts)
        Data.append(entry)
非常感谢你的帮助 !!!
@Shahin 非常感谢您的帮助。请问在选择器中加入 .agentCont、以及在 .agentCont 和 .agentPhone 之间留一个空格,有什么特定原因吗?
我已将您的建议实施如下,并且运行良好:
import requests
from bs4 import BeautifulSoup

# The four paginated search-result URLs (list-1 .. list-4).
urls = [
    "https://www.realcommercial.com.au/sold/in-luddenham%2c+nsw+2745%3bbadgerys+creek%2c+nsw+2555%3bkemps+creek%2c+nsw+2178%3bmount+vernon%2c+nsw+2178%3bcecil+park%2c+nsw+2178%3bgreendale%2c+nsw+2550%3baustral%2c+nsw+2179%3bwallacia%2c+nsw+2745%3berskine+park%2c+nsw+2759%3bbringelly%2c+nsw+2556%3brossmore%2c+nsw+2557/list-{0}?activeSort=date-desc&autoSuggest=true&includePropertiesWithin=includesurrounding&minFloorArea=10000".format(i)
    for i in range(1, 5)
]

Data = []
for listing_url in urls:
    listing_soup = BeautifulSoup(requests.get(listing_url).content, 'html.parser')
    # One detail-page link per result card.
    detail_urls = [a['href'] for a in
                   listing_soup.find_all('a', attrs={'class': 'details-panel'})]
    for detail_url in detail_urls:
        detail = BeautifulSoup(requests.get(detail_url).content, 'html.parser')
        Address = [p.text.strip() for p in
                   detail.find_all('p', attrs={'class': 'full-address'})]
        Sold_Date = [li.text.strip() for li in
                     detail.find_all('li', attrs={'class': 'sold-date'})]
        # Summary list mixes land and floor areas; split them by prefix.
        summaries = [ul.text.strip() for ul in
                     detail.find_all('ul', attrs={'class': 'summaryList'})]
        Land_Area = [s for s in summaries if s.startswith('Land Area')]
        Floor_Area = [s for s in summaries if s.startswith('Floor Area')]
        # `+= child` extends the list with each child tag's contents.
        Property_Type = []
        for child in detail.find('div', attrs={'class': 'propTypes ellipsis'}).findChildren():
            Property_Type += child
        Agency_Name = [d.text.strip() for d in
                       detail.find_all('div', attrs={'class': 'agencyName ellipsis'})]
        Agent_Name = [d.text.strip() for d in
                      detail.find_all('div', attrs={'class': 'agentName ellipsis'})]
        # Full numbers live in the data-value attribute; one per agent.
        phone = [tag['data-value'] for tag in
                 detail.select(".agentCont .agentPhone [rel='showContactNumber']")]
        # Same record layout as before; the phone list is nested one level,
        # exactly as the original Contacts.append(phone) produced.
        Data.append([Address, Sold_Date, Land_Area, Floor_Area,
                     Property_Type, Agency_Name, Agent_Name, [phone]])
非常感谢!!
我采用了三个不同的链接。 第一个包含一个代理的编号,第二个包含两个代理的编号,最后一个显然包含三个代理的编号。 要一次处理所有这些,请参见以下脚本:
import requests
from bs4 import BeautifulSoup

# Three listings with one, two and three agents respectively.
main_links = (
    "https://www.realcommercial.com.au/property-land+development-nsw-badgerys+creek-500502195",
    "https://www.realcommercial.com.au/property-land+development-nsw-austral-500468083",
    "https://www.realcommercial.com.au/property-industrial+warehouse-nsw-minchinbury-502343342"
)

def phone_parser(link):
    """Print all agent contact numbers on one listing page, space-separated."""
    page_soup = BeautifulSoup(requests.get(link).text, "lxml")
    numbers = [tag['data-value'] for tag in
               page_soup.select(".agentCont .agentPhone [rel='showContactNumber']")]
    print(' '.join(numbers))

if __name__ == '__main__':
    for url in main_links:
        phone_parser(url)
结果:
+61 419 018 356
0412 549 766 0407 506 010
+61 414 836 817 +61 401 146 051 +61 412 992 830
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.