
How to web scrape tables embedded in websites using Python

This is the website I am trying to scrape: https://clinicaltrials.gov/ct2/results?term=wound+care

Specifically, I would like to get the link of each Study Title, follow that link, and then scrape the 'Eligibility Criteria' section of that study's page.

This is the code I have been using:

# import statements
import requests  # used to fetch the web page so we can hold it in memory
import bs4  # used to parse the HTML text
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Static (most reliable)

%matplotlib inline

url = "https://clinicaltrials.gov/ct2/results?term=wound+care"

header = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.2 Safari/605.1.15'}

page = requests.get(url, headers=header)

# The response status tells us whether the request was successful
page.reason

page.text

NOW: obviously I can use Beautiful Soup to parse that page, but that's not my issue. The response doesn't contain any of the data from the table on that URL, and I'm not quite sure why. I think it has something to do with requests.get(), because that's what actually fetches the web page.

There are two ways to get the data. One is to use Selenium. The other is to use requests and parse the data from the JSON response, as in the code below. Check the comments in the code.

from bs4 import BeautifulSoup
import requests
import json
import re

headers = {
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'DNT': '1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
    'Sec-Fetch-User': '?1',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-Mode': 'navigate',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'ru,en-US;q=0.9,en;q=0.8,tr;q=0.7',
}

params = (
    ('term', 'wound care'),
)

with requests.Session() as s:
    response = s.get('https://clinicaltrials.gov/ct2/results', headers=headers, params=params)
    result_url = re.search(r'"url": "(.*)",', response.text).group(1)
    columns = re.search(r'"columns":\s+(.*),\s+"columnDefs"', response.text, re.DOTALL).group(1)
    columns = json.loads(re.sub(r"//\s+\d+", "", columns))
    # look up the index of each column you need in the columns json
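    # for example, to find the study-title column (the check below is a
    # guess -- print `columns` first and adjust to its actual structure):
    # title_idx = next(i for i, col in enumerate(columns) if 'Title' in str(col))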

    # Build the DataTables POST payload. Rather than writing out all 26
    # column entries by hand, generate them in a loop; only column 1
    # differs (it is not searchable).
    data = {
        'draw': '1',
        'start': '0',
        'length': '10',
        'search[value]': '',
        'search[regex]': 'false',
    }
    for i in range(26):
        data[f'columns[{i}][data]'] = str(i)
        data[f'columns[{i}][name]'] = ''
        data[f'columns[{i}][searchable]'] = 'false' if i == 1 else 'true'
        data[f'columns[{i}][orderable]'] = 'false'
        data[f'columns[{i}][search][value]'] = ''
        data[f'columns[{i}][search][regex]'] = 'false'
    response = s.post(f'https://clinicaltrials.gov/{result_url}', headers=headers, data=data)
    data = response.json()
    # the total record count is in the 'recordsTotal' field
    # pull the fields you need from each row, using the indexes found in the columns json
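
From there, you can follow the links the question asks for. Below is a rough, untested sketch of that last step, continuing from the snippet above (it reuses data, headers, and the imports). It assumes the JSON rows live in the response's 'data' field (the usual DataTables layout, consistent with the recordsTotal field above) and that the title cell contains an <a href="/ct2/show/..."> link; verify both against the actual response before relying on them.

for row in data['data']:  # each row is a list indexed like the columns json
    for cell in row:
        # look for the classic study-page link in the cell's HTML
        m = re.search(r'href="(/ct2/show/[^"]+)"', str(cell))
        if m:
            study_url = 'https://clinicaltrials.gov' + m.group(1)
            study_page = requests.get(study_url, headers=headers)
            study_soup = BeautifulSoup(study_page.text, 'html.parser')
            # locate the 'Eligibility Criteria' section by its heading text;
            # inspect the study page to choose a more precise selector
            print(study_url, study_soup.find(string=re.compile('Eligibility Criteria')))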

The script is not grabbing any data because the data is not there: it is rendered by JavaScript in your browser. When you use requests, no JavaScript runs, so the table content is missing.

If you look at the response you receive, you can see

<noscript>
    <div style="text-align:center;font-weight:bold;">Please enable JavaScript to see the List of Studies Found</div>
</noscript>

Since you are already familiar with requests, I suggest using Requests-HTML, a different library (by the same author) that supports JavaScript rendering.

pip install requests-html

Here is a working example:

from requests_html import HTMLSession
from bs4 import BeautifulSoup
session = HTMLSession()
r = session.get('https://clinicaltrials.gov/ct2/results?term=wound+care')
r.html.render()
soup = BeautifulSoup(r.html.html, 'html.parser')
print(soup.find('table', id="theDataTable"))

Sample Output

<table aria-describedby="theDataTable_info" class="stripe row-border compact dataTable no-footer dtr-column" id="theDataTable" role="grid" style="width: 100%; opacity: 1;"><thead><tr role="row"><th class="sorting_disabled dt-body-right" colspan="1" rowspan="1">Row</th><th class="sorting_disabled dt-body-center" colspan="1" rowspan="1">Saved</th><th class="sorting_disabled" colspan="1" rowspan="1">Status</th><th class="sorting_disabled" colspan="1" rowspan="1"><span title="Select the study title to view the study">Study Title</span></th><th class="sorting_disabled" colspan="1" rowspan="1">Conditions</th><th class="sorting_disabled" colspan="1" rowspan="1">Interventions</th><th class="sorting_disabled" colspan="1" rowspan="1">Locations</th></tr></thead> <tbody><tr class="odd parent" role="row"><td class="dt-body-right" tabindex="0"><a id="rowId1"></a>1</td>...
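
From the rendered table you can then collect the study links the question asks for. A small sketch (the '/ct2/show/' prefix matches the classic study URLs, but double-check it against the rendered HTML):

table = soup.find('table', id='theDataTable')
links = ['https://clinicaltrials.gov' + a['href']
         for a in table.find_all('a', href=True)
         if a['href'].startswith('/ct2/show/')]
print(links[:5])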

Note: Alternatively, you could also take a look at Selenium.
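
If you go that route, a minimal Selenium sketch looks like this (assumes Selenium 4 and a matching chromedriver on your PATH; the table id mirrors the sample output above):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://clinicaltrials.gov/ct2/results?term=wound+care')
# wait for the JavaScript-rendered table to appear
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, 'theDataTable')))
for a in driver.find_elements(By.CSS_SELECTOR, '#theDataTable a'):
    print(a.get_attribute('href'))
driver.quit()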

The content of the table is downloaded via AJAX requests after the page is loaded; I think the get you have in your code only downloads the page itself, not the AJAX data.

Try getting the data from the page using this instead:

from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
myurl = 'https://clinicaltrials.gov/ct2/results?term=wound+care'

# grab the page
uClient = uReq(myurl)
# read the content into a variable
page_html = uClient.read()
uClient.close()

# parse the HTML so the program can traverse it
page_soup = soup(page_html, "html.parser")
