Parse HTML and extract data from a tag's string (in JSON format)

Question

A script tag was extracted from an HTML file:

from bs4 import BeautifulSoup

# Open the report and parse it; the context manager closes the file once
# BeautifulSoup has read it, and naming the parser keeps the result the same
# regardless of which optional parsers are installed.
with open("centrimo.html") as html:
    parsed_html=BeautifulSoup(html, "html.parser")

# First <script> tag of the parsed document (the original referenced an
# undefined name, `parsed_cen`, here).
script_data=parsed_html.script

From the string contained in the script tag, I would like to extract the data stored under the keys "sequences", "neg_sequences", "seqs" and "neg_seqs".

    <script type="text/javascript">
  //@JSON_VAR data
  var data = {
    "version": "5.3.3",
    "revision": "1667d7719daf2af1693cc039ba463bc4d2304d23",
    "release": "Sun Feb 7 15:39:52 2021 -0800",
    "program": "CentriMo",
    "options": {
      "cd": false,
      "neg_sequences": true,
      "noseq": false,
      "mcc": false
    },
    "seqlen": 101,
    "tested": 435,
    "alphabet": {
      "name": "DNA",
      "like": "dna",
      "ncore": 4
    },
    "background": [0.2788, 0.2212, 0.2212, 0.2788],
    "sequences": [
      "AT1G04100.1_CDS", "AT1G05860.1_CDS", "AT1G13910.1_CDS",
      "AT1G21065.1_CDS", "AT1G26190.1_CDS", "AT1G32940.1_CDS",
      "AT1G50575.1_CDS", "AT1G55810.1_CDS", "AT1G66430.1_CDS",
      "AT1G71430.1_CDS", "AT1G77170.1_CDS", "AT1G78610.1_CDS",
      "AT2G02955.1_CDS", "AT2G16280.1_CDS", "AT2G17080.1_CDS",
      "AT2G19620.1_CDS", "AT2G19640.1_CDS", "AT2G30840.1_CDS",
      "AT2G39450.1_CDS", "AT2G41380.1_CDS", "AT2G42580.1_CDS",
      "AT3G01680.1_CDS", "AT3G05680.1_CDS", "AT3G20110.1_CDS",
      "AT3G20260.1_CDS", "AT3G21360.1_CDS", "AT3G23070.1_CDS",
      "AT3G23590.1_CDS", "AT3G46820.1_CDS", "AT3G48250.1_CDS",
      "AT3G61200.1_CDS", "AT4G08510.1_CDS", "AT4G15070.1_CDS",
      "AT4G24670.1_CDS", "AT4G25450.1_CDS", "AT4G28600.1_CDS",
      "AT4G31910.1_CDS", "AT4G34810.1_CDS", "AT4G35030.3_CDS",
      "AT4G37170.1_CDS", "AT4G38630.1_CDS", "AT4G39720.1_CDS",
      "AT5G07340.1_CDS", "AT5G12970.1_CDS", "AT5G13470.1_CDS",
      "AT5G18950.1_CDS", "AT5G22840.1_CDS", "AT5G25590.1_CDS",
      "AT5G27395.1_CDS", "AT5G53370.1_CDS", "AT5G63610.1_CDS",
      "AT5G64830.1_CDS", "AT5G64900.1_CDS", "AT5G67620.1_CDS"
    ],
    "neg_sequences": [
      "AT1G01600.1_CDS", "AT2G32480.1_CDS", "AT2G41740.1_CDS",
      "AT3G19490.1_CDS", "AT3G24030.1_CDS", "AT3G25580.1_CDS",
      "AT3G48330.1_CDS", "AT3G59220.1_CDS", "AT4G13340.1_CDS",
      "AT4G33590.1_CDS", "AT5G03080.1_CDS", "AT5G23700.1_CDS",
      "AT5G41010.1_CDS"
    ],
    "motifs": [
      {
        "db": 2,
        "id": "ath-miR419",
        "alt": "MIMAT0001327",
        "consensus": "CAACATCCTCAGCATTCATAA",
        "len": 21,
        "motif_evalue": "0.0e+000",
        "motif_nsites": 20,
        "n_tested": 40,
        "score_threshold": 5,
        "url": "http://www.mirbase.org/cgi-bin/mature.pl?mature_acc=MIMAT0001327",
        "pwm": [
          [0.164036, 0.479478, 0.163749, 0.192738],  
          [0.479764, 0.163749, 0.192452, 0.164036]
        ],
        "total_sites": 10,
        "sites": [
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        ],
        "neg_total_sites": 2,
        "neg_sites": [
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        ],
        "seqs": [0, 1, 13, 15, 16, 23, 27, 36, 44, 48],
        "neg_seqs": [3, 10],
        "peaks": [
          {
            "center": 0,
            "fisher_log_adj_pvalue": 0
          }
        ]
      }
    ]
  };
</script>

I tried to parse the tag's string as JSON, but I got the following error:

import json
# Raises JSONDecodeError: the tag's text is JavaScript source (a
# "//@JSON_VAR data" comment followed by "var data = {...};"), not a bare
# JSON document, so the decoder finds no JSON value at the start.
j_script = json.loads(script_data.string)

Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/usr/lib/python3.7/json/__init__.py", line 348, in loads
    return _default_decoder.decode(s)
  File "/usr/lib/python3.7/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/usr/lib/python3.7/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 2 column 7 (char 7)

Thanks in advance

PS. an example of a complete html file that I would like to parse can be found ( here )

Edit: In the original post I mentioned that I got an indentation error. That happened after I tried to manually edit the JSON object by removing all the whitespace and "\n" characters. Although I don't think it fundamentally changes the question, I apologize for the mistake.

[UPDATE] I was able to adapt the answer in this post as follows

# Split the tag's text at the first '='; tmp[2] is everything after it,
# i.e. the object literal plus whatever follows it.
tmp=script_data.string.partition('=')
# Remove each ";" that is followed by a newline and four spaces, which is
# meant to drop the statement terminator after the closing brace.
# NOTE(review): this relies on the exact whitespace after the ";" in the
# tag's text - confirm against the real file.
j_tmp=tmp[2].replace(";\n    ","")  
# Decode the remaining text as JSON.
j_script=json.loads(j_tmp)

The second line is a bit clumsy (I couldn't adapt the answer in this other post), but overall it does the trick. Now I'm trying to obtain the 'seqs' data, which is contained in the "motifs" list.

Help with the second line in the code above will be much appreciated

Answer 1

Below is the best answer that I could come up with.

from bs4 import BeautifulSoup
import json

# Parse the HTML report. The context manager closes the file as soon as
# BeautifulSoup has consumed it, and the parser is named explicitly so the
# result does not depend on which optional parsers happen to be installed.
with open("<HTML File>") as html:
    parsed_html=BeautifulSoup(html, "html.parser")

# First <script> tag of the document; its text has the form
# "//@JSON_VAR data\n var data = {...};".
script_data=parsed_html.script

# Split at the first '='; the last element of the tuple (tmp) holds the
# object literal followed by the statement terminator.
tmp=script_data.string.partition('=')

# Strip surrounding whitespace and the trailing ';' so only the JSON text
# remains. Unlike replacing a fixed ";\n    " sequence, this does not depend
# on how the closing line of the script happens to be indented.
j_tmp=tmp[2].strip().rstrip(';')

j_script=json.loads(j_tmp)

# Identifiers of the primary sequences.
seq_id=j_script["sequences"]

# Loop over the motif dictionaries looking for "seqs"; as before, the last
# motif that has the key wins.
dict_tmp=None
for idict in j_script["motifs"]:
    if "seqs" in idict:
        dict_tmp=idict

# Fail with a clear message instead of a NameError when no motif has "seqs".
if dict_tmp is None:
    raise KeyError('no entry in "motifs" contains a "seqs" key')

seq_n=dict_tmp["seqs"]

The code still needs improvement but I think it can do the job now.

Parse HTML and extract data from a tag's string (in JSON format)

Question

1 answers

solution1
0 2021-07-15 15:22:45

Parse HTML and extract data from a tag's string (in JSON format)

Question

1 answers

solution1 0 2021-07-15 15:22:45

solution1
0 2021-07-15 15:22:45