[英]How to extract table from website using python
我一直在尝试从网站上提取表格,但毫无头绪。谁能帮帮我?我的目标是提取 scope 页面中的表格:https://training.gov.au/Organisation/Details/31102
import requests
from bs4 import BeautifulSoup
# Download the organisation details page and hand its HTML to BeautifulSoup.
url = "https://training.gov.au/Organisation/Details/31102"
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'lxml')

# Locate the element whose id is "ScopeQualification" and, for each <tr>
# inside it, build the list of whitespace-separated words in the row text.
table = soup.find(id="ScopeQualification")
[row.get_text().split() for row in table.find_all("tr")]
该表格是通过 ajax 加载的,需要先获取 OrganisationId。
import requests
import json
import pandas as pd
import re
def get_organisationId(url):
    """Return the first OrganisationId value found in the page at *url*.

    The page is requested with a browser-like User-Agent header and its text
    is searched for ``OrganisationId=<value>&``.

    Example input: 'https://training.gov.au/Organisation/Details/31102'

    Returns:
        The captured value of the first match, or None when the page text
        contains no match.
    """
    browser_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36'}
    page = requests.get(url, headers=browser_headers)
    first_match = re.search(r'OrganisationId=(.*?)&', page.text)
    if first_match is None:
        return None
    return first_match.group(1)
# Resolve the OrganisationId from the organisation's details page first;
# the result is what get_AjaxScopeQualification is called with further down.
url = 'https://training.gov.au/Organisation/Details/31102'
organisationId = get_organisationId(url)
def get_AjaxScopeQualification(organisationId):
    """Fetch and decode the scope-qualification data for one organisation.

    Posts paging/sorting parameters to the site's AjaxScopeQualification
    endpoint and decodes the reply with ``json.loads``.

    Args:
        organisationId: Identifier interpolated into the endpoint URL and the
            referer header (e.g. the value returned by get_organisationId).

    Returns:
        The decoded response object, or None when *organisationId* is falsy
        (no request is made in that case).
    """
    if not organisationId:
        return None

    url = f'https://training.gov.au/Organisation/AjaxScopeQualification/{organisationId}?tabIndex=4'
    headers = {
        'origin': 'https://training.gov.au',
        'referer': f'https://training.gov.au/Organisation/Details/{organisationId}?tabIndex=4',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest'
    }
    data = {'page': '1', 'size': '100', 'orderBy': 'Code-asc', 'groupBy': '', 'filter': ''}
    r = requests.post(url, json=data, headers=headers)

    # The body contains `new Date(a,b,c,0,0,0)` expressions, which json.loads
    # cannot parse; replace each with the quoted string "a-b-c". The third
    # group must be used for the last component (the earlier pattern repeated
    # the second group, so the day was lost).
    # NOTE(review): if these are JavaScript Date arguments the middle number
    # is a zero-based month; it is passed through unchanged here - confirm
    # before treating it as a calendar month.
    payload = re.sub(r'new Date\((\d+),(\d+),(\d+),0,0,0\)', r'"\1-\2-\3"', r.text)
    return json.loads(payload)
# Fetch the scope data, then flatten the records under 'data' into a
# DataFrame, carrying the top-level 'total' value along as a column.
response = get_AjaxScopeQualification(organisationId)
dfn = pd.json_normalize(response, record_path='data', meta=['total'])

# Show the available columns, then just the three of interest.
print(dfn.columns)
print(dfn[['Code', 'Title', 'Extent']])
结果:
response['data'][0]
{'Id': '5096634d-4210-4fd4-a51d-f548cd39d57b',
'NrtId': '2feb7e3f-7fc6-4719-ba66-2a066f6635c7',
'RtoId': '3fbfd9c9-3cce-4d69-973e-4e2674f8c5a9',
'TrainingComponentType': 2,
'Code': 'BSB20115',
'Title': 'Certificate II in Business',
'IsImplicit': False,
'ExtentId': '01',
'Extent': 'Deliver and assess',
'StartDate': '2015-3-3',
'EndDate': '2022-3-3',
'DeliveryNsw': True,
'DeliveryVic': True,
'DeliveryQld': True,
'DeliverySa': True,
'DeliveryWa': True,
'DeliveryTas': True,
'DeliveryNt': True,
'DeliveryAct': True,
'ScopeDecisionType': 0,
'ScopeDecision': 'Deliver and assess',
'OverseasCodeAlpha': None,
'OverseasCodeAlhpaList': [],
'OverseasCodeAlphaOutput': ''}
处理 -> https://training.gov.au/Search/SearchOrganisation?Name=&IncludeUnregisteredRtos=false&IncludeNotRtos=false&orgSearchByNameSubmit=Search&AdvancedSearch=&JavaScriptEnabled=true
它是 ajax 链接 -> https://training.gov.au/Search/AjaxGetOrganisations?implicitNrtScope=True&includeUnregisteredRtosForScopeSearch=True&includeUnregisteredRtos=False&includeNotRtos=False&orgSearchByNameSubmit=Search&JavaScriptEnabled=true
使用 ajax 链接和 post 方法获取 json 数据。
更改 'size': '200' 可以调整响应返回的行数。
# Organisation search endpoint; the query string is split across adjacent
# literals purely for readability (they concatenate to a single URL).
url = (
    'https://training.gov.au/Search/AjaxGetOrganisations'
    '?implicitNrtScope=True'
    '&includeUnregisteredRtosForScopeSearch=True'
    '&includeUnregisteredRtos=False'
    '&includeNotRtos=False'
    '&orgSearchByNameSubmit=Search'
    '&JavaScriptEnabled=true'
)
headers = {
    'origin': 'https://training.gov.au',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
}
# Paging and sorting parameters sent as the JSON request body.
data = {'page': '1', 'size': '200', 'orderBy': 'LegalPersonName-asc', 'groupBy': '', 'filter': ''}
r = requests.post(url, json=data, headers=headers)
response = r.json()
结果
从搜索结果中,您可以直接得到 ea38f597-077e-4c57-b7b6-7ca7dde88399 作为 OrganisationId,无需再通过 'Codes': '6639' 去解析 https://training.gov.au/Organisation/Details/6639 来获取组织 ID。
'Codes': '6639',
https://training.gov.au/Organisation/Details/6639
https://training.gov.au/Organisation/AjaxScopeSkillSet/ea38f597-077e-4c57-b7b6-7ca7dde88399?includeImplicit=True&tabIndex=4&_=1610518795452
response['data'][0]
{'OrganisationId': 'ea38f597-077e-4c57-b7b6-7ca7dde88399',
'IsRto': True,
'IsTpd': False,
'Codes': '6639',
'LegalPersonName': '1 EDUCATION PTY LTD',
'LegalPersonNameNonCurrent': 'Brad Fenby and Associates Pty Ltd, Franklyn Scholar (Victoria) Pty Ltd',
'TradingNames': [],
'WebAddresses': ['http://www.1education.com.au'],
'GeneralEnquiriesPhone': '0478752453',
'RegistrationStatus': None,
'ValidationType': 0,
'RtoStatus': 0,
'StatusString': 'Current',
'RegistrationManagerId': '12',
'RegistrationStartDate': '/Date(1554037200000+1100)/',
'RegistrationEndDate': '/Date(1774789200000+1100)/',
'CreatedDate': '/Date(1307654398430+1000)/',
'ExternalLinks': {'ExternalLinkType': 2,
'Description': 'MySkillsRto',
'Url': 'http://www.myskills.gov.au/RegisteredTrainers/Details?rtocode={0}'},
'RtoType': '91',
'ActiveScopeAct': True,
'ActiveScopeNsw': True,
'ActiveScopeVic': True,
'ActiveScopeQld': True,
'ActiveScopeSA': True,
'ActiveScopeNT': True,
'ActiveScopeWA': True,
'ActiveScopeTas': True,
'ActiveScopeInt': True,
'RegistrationManagerShortName': 'ASQA',
'StatusSortOrder': '4',
'MySkillsLink': 'http://www.myskills.gov.au/RegisteredTrainers/Details?rtocode=6639'}
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.