I have been trying to extract a table from a website, but I am lost. Can anyone help me? My goal is to extract the table on the Scope page: https://training.gov.au/Organisation/Details/31102
import requests
from bs4 import BeautifulSoup
# Download the organisation details page and parse its HTML with the lxml parser.
url = "https://training.gov.au/Organisation/Details/31102"
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'lxml')
# Look up the element whose id is "ScopeQualification".
# NOTE(review): soup.find returns None when nothing matches, in which case the
# next line raises AttributeError. Presumably the table is filled in by a
# separate AJAX request and is absent from this HTML - confirm.
table = soup.find(id ="ScopeQualification")
# Split the text of every <tr> row on whitespace (bare expression: the result is
# only displayed in an interactive session).
[row.text.split() for row in table.find_all("tr")]
First, get the OrganisationId from 'https://training.gov.au/Organisation/Details/31102'.
import requests
import json
import pandas as pd
import re
def get_organisationId(url):
    """Fetch an organisation details page and pull out its OrganisationId.

    The page at ``url`` is requested with a browser-like User-Agent header and
    the response text is searched for the first ``OrganisationId=<value>&``
    occurrence.

    Args:
        url: Address of the details page,
            e.g. 'https://training.gov.au/Organisation/Details/31102'.

    Returns:
        The captured value as a string, or ``None`` when the page text
        contains no such occurrence.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36',
    }
    page = requests.get(url, headers=browser_headers)

    # Only the first occurrence matters; the lazy group stops at the next '&'.
    first_match = re.search(r'OrganisationId=(.*?)&', page.text)
    if first_match is None:
        return None
    return first_match.group(1)
# Get the organisationId first: request the details page and extract the id
# from its text. The value is None when the page contains no id.
url = 'https://training.gov.au/Organisation/Details/31102'
organisationId = get_organisationId(url)
def get_AjaxScopeQualification(organisationId):
    """Fetch the scope-qualification rows for one organisation.

    POSTs a paging request (page 1, 100 rows, ordered by code) to the site's
    ``AjaxScopeQualification`` endpoint and decodes the reply.

    Args:
        organisationId: Identifier placed in the endpoint URL and the
            ``referer`` header. A falsy value (``None``, ``''``) means there
            is nothing to look up.

    Returns:
        The decoded JSON payload, or ``None`` when ``organisationId`` is
        falsy (no request is made in that case).

    Raises:
        json.JSONDecodeError: If the reply is still not valid JSON after the
            date expressions have been quoted.
    """
    if not organisationId:
        return None

    url = f'https://training.gov.au/Organisation/AjaxScopeQualification/{organisationId}?tabIndex=4'
    headers = {
        'origin': 'https://training.gov.au',
        'referer': f'https://training.gov.au/Organisation/Details/{organisationId}?tabIndex=4',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest',
    }
    data = {'page': '1', 'size': '100', 'orderBy': 'Code-asc', 'groupBy': '', 'filter': ''}
    r = requests.post(url, json=data, headers=headers)

    # The reply embeds dates as JavaScript `new Date(y,m,d,0,0,0)` expressions,
    # which are not valid JSON; quote each one as a "y-m-d" string before
    # decoding. The third group is the day (the previous template repeated
    # the month there).
    # NOTE(review): JavaScript Date months are zero-based, so the middle number
    # may be one less than the calendar month - confirm against the site.
    json_text = re.sub(
        r'new Date\((\d+),(\d+),(\d+),0,0,0\)',
        r'"\1-\2-\3"',
        r.text,
    )
    return json.loads(json_text)
# Fetch the scope-qualification payload for the organisation id obtained above.
response = get_AjaxScopeQualification(organisationId)
# Flatten the records stored under the 'data' key into a DataFrame, carrying the
# top-level 'total' value along as an extra column.
dfn = pd.json_normalize(response, 'data', meta=['total'])
# Show the available columns, then only the code, title and extent of each row.
print(dfn.columns)
print(dfn[[ 'Code', 'Title', 'Extent']])
result:
response['data'][0]
{'Id': '5096634d-4210-4fd4-a51d-f548cd39d57b',
'NrtId': '2feb7e3f-7fc6-4719-ba66-2a066f6635c7',
'RtoId': '3fbfd9c9-3cce-4d69-973e-4e2674f8c5a9',
'TrainingComponentType': 2,
'Code': 'BSB20115',
'Title': 'Certificate II in Business',
'IsImplicit': False,
'ExtentId': '01',
'Extent': 'Deliver and assess',
'StartDate': '2015-3-3',
'EndDate': '2022-3-3',
'DeliveryNsw': True,
'DeliveryVic': True,
'DeliveryQld': True,
'DeliverySa': True,
'DeliveryWa': True,
'DeliveryTas': True,
'DeliveryNt': True,
'DeliveryAct': True,
'ScopeDecisionType': 0,
'ScopeDecision': 'Deliver and assess',
'OverseasCodeAlpha': None,
'OverseasCodeAlhpaList': [],
'OverseasCodeAlphaOutput': ''}
To handle the search page -> https://training.gov.au/Search/SearchOrganisation?Name=&IncludeUnregisteredRtos=false&IncludeNotRtos=false&orgSearchByNameSubmit=Search&AdvancedSearch=&JavaScriptEnabled=true
Its AJAX link is -> https://training.gov.au/Search/AjaxGetOrganisations?implicitNrtScope=True&includeUnregisteredRtosForScopeSearch=True&includeUnregisteredRtos=False&includeNotRtos=False&orgSearchByNameSubmit=Search&JavaScriptEnabled=true
Use the AJAX link with the POST method to get the JSON data.
Change 'size': '200'
to adjust the number of rows returned in the response.
# AJAX endpoint behind the organisation search page.
# (Plain string: the literal has no placeholders, so no f-prefix is needed.)
url = 'https://training.gov.au/Search/AjaxGetOrganisations?implicitNrtScope=True&includeUnregisteredRtosForScopeSearch=True&includeUnregisteredRtos=False&includeNotRtos=False&orgSearchByNameSubmit=Search&JavaScriptEnabled=true'
headers = {
    'origin': 'https://training.gov.au',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
}
# Paging request: first page, 200 rows, ordered by legal person name.
data = {
    'page': '1',
    'size': '200',
    'orderBy': 'LegalPersonName-asc',
    'groupBy': '',
    'filter': '',
}
# POST the paging parameters as a JSON body and decode the JSON reply.
r = requests.post(url, json=data, headers=headers)
response = r.json()
Result:
From the search result, you can take ea38f597-077e-4c57-b7b6-7ca7dde88399
directly as the OrganisationId,
so there is no need to use 'Codes': '6639'
to parse https://training.gov.au/Organisation/Details/6639
in order to get the OrganisationId.
'Codes': '6639',
https://training.gov.au/Organisation/Details/6639
https://training.gov.au/Organisation/AjaxScopeSkillSet/ea38f597-077e-4c57-b7b6-7ca7dde88399?includeImplicit=True&tabIndex=4&_=1610518795452
response['data'][0]
{'OrganisationId': 'ea38f597-077e-4c57-b7b6-7ca7dde88399',
'IsRto': True,
'IsTpd': False,
'Codes': '6639',
'LegalPersonName': '1 EDUCATION PTY LTD',
'LegalPersonNameNonCurrent': 'Brad Fenby and Associates Pty Ltd, Franklyn Scholar (Victoria) Pty Ltd',
'TradingNames': [],
'WebAddresses': ['http://www.1education.com.au'],
'GeneralEnquiriesPhone': '0478752453',
'RegistrationStatus': None,
'ValidationType': 0,
'RtoStatus': 0,
'StatusString': 'Current',
'RegistrationManagerId': '12',
'RegistrationStartDate': '/Date(1554037200000+1100)/',
'RegistrationEndDate': '/Date(1774789200000+1100)/',
'CreatedDate': '/Date(1307654398430+1000)/',
'ExternalLinks': {'ExternalLinkType': 2,
'Description': 'MySkillsRto',
'Url': 'http://www.myskills.gov.au/RegisteredTrainers/Details?rtocode={0}'},
'RtoType': '91',
'ActiveScopeAct': True,
'ActiveScopeNsw': True,
'ActiveScopeVic': True,
'ActiveScopeQld': True,
'ActiveScopeSA': True,
'ActiveScopeNT': True,
'ActiveScopeWA': True,
'ActiveScopeTas': True,
'ActiveScopeInt': True,
'RegistrationManagerShortName': 'ASQA',
'StatusSortOrder': '4',
'MySkillsLink': 'http://www.myskills.gov.au/RegisteredTrainers/Details?rtocode=6639'}
The technical posts on this site are licensed under CC BY-SA 4.0. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.