简体   繁体   中英

How to extract a table from a website using Python

I have been trying to extract a table from a website, but I am lost. Can anyone help me? My goal is to extract the table on the Scope page: https://training.gov.au/Organisation/Details/31102

import requests
from bs4 import BeautifulSoup
# Download the organisation details page.
url = "https://training.gov.au/Organisation/Details/31102"
response = requests.get(url)
page = response.text

# Parse the HTML and look up the element whose id is "ScopeQualification".
soup = BeautifulSoup(page, "lxml")
table = soup.find(id="ScopeQualification")

# Whitespace-split text of every <tr> found under that element.
[tr.text.split() for tr in table.find_all("tr")]
  1. Find the OrganisationId in the page source of 'https://training.gov.au/Organisation/Details/31102'.
  2. Find the XHR URL, https://training.gov.au/Organisation/AjaxScopeQualification/3fbfd9c9-3cce-4d69-973e-4e2674f8c5a9?tabIndex=4 , which is requested with the POST method.

XHR 链接

import requests
import json
import pandas as pd
import re

def get_organisationId(url):
    """Download an organisation details page and return its OrganisationId.

    The page at ``url`` is requested with a browser-like User-Agent. The
    returned value is the text captured between the first ``OrganisationId=``
    and the following ``&`` in the response body, or ``None`` when the body
    contains no such match.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36',
    }
    html = requests.get(url, headers=browser_headers).text
    # Only the first occurrence matters, so a single search is enough.
    match = re.search(r'OrganisationId=(.*?)&', html)
    if match is None:
        return None
    return match.group(1)

# Resolve the OrganisationId from the details page first; the XHR endpoint
# URL used further down is built from this id rather than from the numeric
# code in the details URL.
url = 'https://training.gov.au/Organisation/Details/31102'
organisationId = get_organisationId(url)


def get_AjaxScopeQualification(organisationId):
    """Fetch an organisation's scope-qualification rows from the XHR endpoint.

    Args:
        organisationId: Identifier interpolated into the endpoint URL and the
            referer header (e.g. the value returned by ``get_organisationId``).

    Returns:
        The response body decoded with ``json.loads``, or ``None`` when
        ``organisationId`` is falsy (no request is made in that case).
    """
    if not organisationId:
        return None

    url = f'https://training.gov.au/Organisation/AjaxScopeQualification/{organisationId}?tabIndex=4'
    headers = {
        'origin': 'https://training.gov.au',
        'referer': f'https://training.gov.au/Organisation/Details/{organisationId}?tabIndex=4',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest',
    }
    # Paging/sorting options sent with the POST; presumably 'size' is the
    # number of rows returned per page -- confirm against the site.
    data = {'page': '1', 'size': '100', 'orderBy': 'Code-asc', 'groupBy': '', 'filter': ''}
    r = requests.post(url, json=data, headers=headers)

    # The body contains `new Date(y,m,d,0,0,0)` expressions, which are not
    # valid JSON. Replace each with a quoted "y-m-d" string before decoding.
    # The last field must be the third capture group (the day); the template
    # previously repeated the second group, yielding "y-m-m".
    # NOTE(review): if these are JavaScript Date constructor arguments, the
    # month is zero-based -- confirm before treating it as a calendar month.
    json_text = re.sub(
        r'new Date\((\d+),(\d+),(\d+),0,0,0\)',
        r'"\1-\2-\3"',
        r.text,
    )
    return json.loads(json_text)
# Fetch the scope data, then flatten the records under 'data' into a
# DataFrame, carrying the top-level 'total' value along as metadata.
response = get_AjaxScopeQualification(organisationId)
dfn = pd.json_normalize(response, record_path='data', meta=['total'])

# Show every available column, then only the columns of interest.
print(dfn.columns)
print(dfn[['Code', 'Title', 'Extent']])

result:

response['data'][0]

{'Id': '5096634d-4210-4fd4-a51d-f548cd39d57b',
 'NrtId': '2feb7e3f-7fc6-4719-ba66-2a066f6635c7',
 'RtoId': '3fbfd9c9-3cce-4d69-973e-4e2674f8c5a9',
 'TrainingComponentType': 2,
 'Code': 'BSB20115',
 'Title': 'Certificate II in Business',
 'IsImplicit': False,
 'ExtentId': '01',
 'Extent': 'Deliver and assess',
 'StartDate': '2015-3-3',
 'EndDate': '2022-3-3',
 'DeliveryNsw': True,
 'DeliveryVic': True,
 'DeliveryQld': True,
 'DeliverySa': True,
 'DeliveryWa': True,
 'DeliveryTas': True,
 'DeliveryNt': True,
 'DeliveryAct': True,
 'ScopeDecisionType': 0,
 'ScopeDecision': 'Deliver and assess',
 'OverseasCodeAlpha': None,
 'OverseasCodeAlhpaList': [],
 'OverseasCodeAlphaOutput': ''}

To handle -> https://training.gov.au/Search/SearchOrganisation?Name=&IncludeUnregisteredRtos=false&IncludeNotRtos=false&orgSearchByNameSubmit=Search&AdvancedSearch=&JavaScriptEnabled=true

Its AJAX link is -> https://training.gov.au/Search/AjaxGetOrganisations?implicitNrtScope=True&includeUnregisteredRtosForScopeSearch=True&includeUnregisteredRtos=False&includeNotRtos=False&orgSearchByNameSubmit=Search&JavaScriptEnabled=true

Use the AJAX link with the POST method to get the JSON data.

Change the value in 'size': '200' to control how many rows the response returns.

# Organisation search XHR endpoint. The query string is written as adjacent
# literals purely for readability; the resulting URL is a single string.
url = (
    'https://training.gov.au/Search/AjaxGetOrganisations'
    '?implicitNrtScope=True'
    '&includeUnregisteredRtosForScopeSearch=True'
    '&includeUnregisteredRtos=False'
    '&includeNotRtos=False'
    '&orgSearchByNameSubmit=Search'
    '&JavaScriptEnabled=true'
)
headers = {
    'origin': 'https://training.gov.au',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
}
# Paging/sorting options sent as the JSON body of the POST request.
data = {'page': '1', 'size': '200', 'orderBy': 'LegalPersonName-asc', 'groupBy': '', 'filter': ''}
r = requests.post(url, json=data, headers=headers)
# Decode the JSON response body.
response = r.json()

result

From the search result, you can get ea38f597-077e-4c57-b7b6-7ca7dde88399 as the OrganisationId directly; there is no need to use 'Codes': '6639' and parse https://training.gov.au/Organisation/Details/6639 to get the OrganisationId.

'Codes': '6639',
https://training.gov.au/Organisation/Details/6639
https://training.gov.au/Organisation/AjaxScopeSkillSet/ea38f597-077e-4c57-b7b6-7ca7dde88399?includeImplicit=True&tabIndex=4&_=1610518795452
response['data'][0]

{'OrganisationId': 'ea38f597-077e-4c57-b7b6-7ca7dde88399',
 'IsRto': True,
 'IsTpd': False,
 'Codes': '6639',
 'LegalPersonName': '1 EDUCATION PTY LTD',
 'LegalPersonNameNonCurrent': 'Brad Fenby and Associates Pty Ltd, Franklyn Scholar (Victoria) Pty Ltd',
 'TradingNames': [],
 'WebAddresses': ['http://www.1education.com.au'],
 'GeneralEnquiriesPhone': '0478752453',
 'RegistrationStatus': None,
 'ValidationType': 0,
 'RtoStatus': 0,
 'StatusString': 'Current',
 'RegistrationManagerId': '12',
 'RegistrationStartDate': '/Date(1554037200000+1100)/',
 'RegistrationEndDate': '/Date(1774789200000+1100)/',
 'CreatedDate': '/Date(1307654398430+1000)/',
 'ExternalLinks': {'ExternalLinkType': 2,
  'Description': 'MySkillsRto',
  'Url': 'http://www.myskills.gov.au/RegisteredTrainers/Details?rtocode={0}'},
 'RtoType': '91',
 'ActiveScopeAct': True,
 'ActiveScopeNsw': True,
 'ActiveScopeVic': True,
 'ActiveScopeQld': True,
 'ActiveScopeSA': True,
 'ActiveScopeNT': True,
 'ActiveScopeWA': True,
 'ActiveScopeTas': True,
 'ActiveScopeInt': True,
 'RegistrationManagerShortName': 'ASQA',
 'StatusSortOrder': '4',
 'MySkillsLink': 'http://www.myskills.gov.au/RegisteredTrainers/Details?rtocode=6639'}

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM