Python Web Scraper IEEE

Question

I am trying to retrieve the keywords of a particular IEEE document. I came across this code here:

        ieee_content = requests.get(link, timeout=180)
        soup = BeautifulSoup(ieee_content.text, 'lxml')
        tag = soup.find_all('script')
        #metadata = "".join(re.findall('global.document.metadata=(.*)', tag[9].text)).replace(";", '').replace('global.document.metadata=', '')
        for i in tag[9]:
            metadata_format = re.compile(r'global.document.metadata=.*', re.MULTILINE)
            metadata = re.findall(metadata_format, i)
            if len(metadata) != 0:
               # convert the list 
               convert_to_json = json.dumps(metadata)
               x = json.loads(convert_to_json)
               s = x[0].replace("'", '"').replace(";", '')

The problem is that my metadata variable is always empty. I tried iterating over all the tags rather than using only tag[9], but metadata is still empty in every case. I also tried using 'xml' instead of 'lxml' as the parser, but the result is the same. I'd appreciate some help with this.

Answer 1

import json
import re
from pprint import pprint

import requests
from bs4 import BeautifulSoup

# Download an IEEE Xplore document page and collect its keywords, grouped by
# keyword type, from the JSON embedded in the page's <script> tags.
ieee_content = requests.get("https://ieeexplore.ieee.org/document/7845555", timeout=180)
# Fail loudly on an HTTP error instead of silently scraping an error page.
ieee_content.raise_for_status()
soup = BeautifulSoup(ieee_content.content, "html.parser")
scripts = soup.find_all("script")

# Matches the JSON array of objects that directly follows a "keywords": key.
pattern = re.compile(r"(?<=\"keywords\":)\[{.*?}\]")

# Maps each keyword type (stripped) to its list of stripped keywords.
keywords_dict = {}
for script in scripts:
    # Skip scripts with no text to search (``.string`` is None), rather than
    # running the regex over the literal text "None".
    if script.string is None:
        continue
    keywords = pattern.findall(script.string)
    # Only a script containing exactly one keywords array is used.
    if len(keywords) != 1:
        continue
    for keyword_type in json.loads(keywords[0]):
        keywords_dict[keyword_type["type"].strip()] = [kwd.strip() for kwd in keyword_type["kwd"]]

pprint(keywords_dict)

Python Web Scraper IEEE

Question

1 answer

solution1
0 ACCEPTED 2021-08-17 07:22:00

Python Web Scraper IEEE

Question

1 answer

solution1 0 ACCEPTED 2021-08-17 07:22:00

solution1
0 ACCEPTED 2021-08-17 07:22:00