簡體   English   中英

使用Scrapy從<script>標記中提取多行javascript內容

[英]Extract multi-line javascript content from <script> tag using Scrapy

我正在嘗試使用Scrapy從此腳本標記中提取數據:

<script>
        // Declared without a value here; it is reassigned inside the ready callback below.
        var hardwareTemplateFunctions;
        var storefrontContextUrl = '';

        // Callback passed to jq(): builds the `data` object (product codes plus one
        // entry per bundle) and hands it to the template/accessory/service helpers.
        // NOTE(review): jq looks like a jQuery alias (DOM-ready handler) — confirm.
        jq(function() {
            var data = new Object();
            data.hardwareProductCode = '9054832';
            data.offeringCode = 'SMART_BASIC.TLF12PLEAS';
            data.defaultTab = '';
            data.categoryId = 10001;

            // Bundles are keyed by an offering-code string. Every numeric field is
            // written as a string and converted through parsePrice().
            // Each literal below lists monthlyPrice and commitmentTime twice; in an
            // object literal the later occurrence is the one that is kept.
            data.bundles = new Object();
                            data.bundles['SMART_SUPERX.TLF12PLEAS'] = {
                    signupFee: parsePrice('0'),
                    newMsisdnFee: parsePrice('199'),
                    upfrontPrice: parsePrice('1099'),
                    monthlyPrice: parsePrice('499'),
                    commitmentTime: parsePrice('12'),
                    offeringTitle: 'SMART Super',
                    offeringType: 'VOICE',
                    monthlyPrice: parsePrice('499'),
                    commitmentTime: 12
                };
                            data.bundles['SMART_PLUSS.TLF12PLEAS'] = {
                    signupFee: parsePrice('0'),
                    newMsisdnFee: parsePrice('199'),
                    upfrontPrice: parsePrice('1599'),
                    monthlyPrice: parsePrice('399'),
                    commitmentTime: parsePrice('12'),
                    offeringTitle: 'SMART Pluss',
                    offeringType: 'VOICE',
                    monthlyPrice: parsePrice('399'),
                    commitmentTime: 12
                };
                            data.bundles['SMART_BASIC.TLF12PLEAS'] = {
                    signupFee: parsePrice('0'),
                    newMsisdnFee: parsePrice('199'),
                    upfrontPrice: parsePrice('2199'),
                    monthlyPrice: parsePrice('299'),
                    commitmentTime: parsePrice('12'),
                    offeringTitle: 'SMART Basis',
                    offeringType: 'VOICE',
                    monthlyPrice: parsePrice('299'),
                    commitmentTime: 12
                };
                            data.bundles['SMART_MINI.TLF12PLEAS'] = {
                    signupFee: parsePrice('0'),
                    newMsisdnFee: parsePrice('199'),
                    upfrontPrice: parsePrice('2999'),
                    monthlyPrice: parsePrice('199'),
                    commitmentTime: parsePrice('12'),
                    offeringTitle: 'SMART Mini',
                    offeringType: 'VOICE',
                    monthlyPrice: parsePrice('199'),
                    commitmentTime: 12
                };
                            // The only bundle whose offeringType is 'PREPAID'; its fees and commitment are all 0.
                            data.bundles['KONTANT_KOMPLETT.REGULAR'] = {
                    signupFee: parsePrice('0'),
                    newMsisdnFee: parsePrice('0'),
                    upfrontPrice: parsePrice('3499'),
                    monthlyPrice: parsePrice('0'),
                    commitmentTime: parsePrice('0'),
                    offeringTitle: 'SMART Kontant',
                    offeringType: 'PREPAID',
                    monthlyPrice: parsePrice('0'),
                    commitmentTime: 0
                };

            data.reviewJson = new Object();


            // The variable is overwritten with the value returned by calling it, so
            // from here on it holds that result rather than the callable.
            // NOTE(review): the callable itself is not defined in this snippet —
            // presumably it is loaded by another script; confirm.
            hardwareTemplateFunctions = hardwareTemplateFunctions(data);
            hardwareTemplateFunctions.init();

            data.reviewSummaryBox = hardwareTemplateFunctions.reviewSummaryBox;

            // accessoryFunctions / additionalServiceFunctions are not defined in this
            // snippet either; each is called with `data` and its result's init() is run.
            accessoryFunctions(data).init();
            additionalServiceFunctions(data).init();
        });

        /**
         * Convert a price string into a number.
         *
         * @param {string} str - Text to parse with parseFloat.
         * @returns {number} The parsed value, or 0 when parseFloat yields NaN.
         */
        function parsePrice(str) {
            var parsed = parseFloat(str);
            if (isNaN(parsed)) {
                return 0;
            }
            return parsed;
        }

        // Initialised to an empty object; nothing else in this script reads or fills it.
        var offerings = {};
    </script>

我想從每個部分獲得如下所示的數據:

 data.bundles['SMART_SUPERX.TLF12PLEAS'] = {
                signupFee: parsePrice('0'),
                newMsisdnFee: parsePrice('199'),
                upfrontPrice: parsePrice('1099'),
                monthlyPrice: parsePrice('499'),
                commitmentTime: parsePrice('12'),
                offeringTitle: 'SMART Super',
                offeringType: 'VOICE',
                monthlyPrice: parsePrice('499'),
                commitmentTime: 12
            };

然后從每個字段中獲取數據並從例如upfrontPrice獲取最終數據(例如,在此示例中為1099)。

我嘗試使用以下方法獲取每個對象:

items = response.xpath('//script/text()').re("data.bundles\[.*\](.*)")

然而,這只給我第一行數據( = { )。 那我該怎么做呢? 有沒有更好的方法從腳本標記中提取此數據?

編輯:當我使用items = response.xpath('//script/text()').re("data.bundles\\[.*\\] = {((?s).*) };")時,我似乎只得到最后一個塊(帶有data.bundles['KONTANT_KOMPLETT.REGULAR'])。

我如何獲得所有這些的列表?

如果您不想使用正則表達式,那么有js2xml ,它解析Javascript代碼並將其轉換為lxml文檔。 然后,您可以使用XPath從Javascript語句中查詢事物。 (免責聲明:我編寫並維護js2xml)

以下是有關如何獲取這些data.bundles分配的示例代碼:

import scrapy

selector = scrapy.Selector(text="""<script>
        var hardwareTemplateFunctions;
        var storefrontContextUrl = '';

        jq(function() {
            var data = new Object();
            data.hardwareProductCode = '9054832';
            data.offeringCode = 'SMART_BASIC.TLF12PLEAS';
            data.defaultTab = '';
            data.categoryId = 10001;

            data.bundles = new Object();
                            data.bundles['SMART_SUPERX.TLF12PLEAS'] = {
                    signupFee: parsePrice('0'),
                    newMsisdnFee: parsePrice('199'),
                    upfrontPrice: parsePrice('1099'),
                    monthlyPrice: parsePrice('499'),
                    commitmentTime: parsePrice('12'),
                    offeringTitle: 'SMART Super',
                    offeringType: 'VOICE',
                    monthlyPrice: parsePrice('499'),
                    commitmentTime: 12
                };
                            data.bundles['SMART_PLUSS.TLF12PLEAS'] = {
                    signupFee: parsePrice('0'),
                    newMsisdnFee: parsePrice('199'),
                    upfrontPrice: parsePrice('1599'),
                    monthlyPrice: parsePrice('399'),
                    commitmentTime: parsePrice('12'),
                    offeringTitle: 'SMART Pluss',
                    offeringType: 'VOICE',
                    monthlyPrice: parsePrice('399'),
                    commitmentTime: 12
                };
                            data.bundles['SMART_BASIC.TLF12PLEAS'] = {
                    signupFee: parsePrice('0'),
                    newMsisdnFee: parsePrice('199'),
                    upfrontPrice: parsePrice('2199'),
                    monthlyPrice: parsePrice('299'),
                    commitmentTime: parsePrice('12'),
                    offeringTitle: 'SMART Basis',
                    offeringType: 'VOICE',
                    monthlyPrice: parsePrice('299'),
                    commitmentTime: 12
                };
                            data.bundles['SMART_MINI.TLF12PLEAS'] = {
                    signupFee: parsePrice('0'),
                    newMsisdnFee: parsePrice('199'),
                    upfrontPrice: parsePrice('2999'),
                    monthlyPrice: parsePrice('199'),
                    commitmentTime: parsePrice('12'),
                    offeringTitle: 'SMART Mini',
                    offeringType: 'VOICE',
                    monthlyPrice: parsePrice('199'),
                    commitmentTime: 12
                };
                            data.bundles['KONTANT_KOMPLETT.REGULAR'] = {
                    signupFee: parsePrice('0'),
                    newMsisdnFee: parsePrice('0'),
                    upfrontPrice: parsePrice('3499'),
                    monthlyPrice: parsePrice('0'),
                    commitmentTime: parsePrice('0'),
                    offeringTitle: 'SMART Kontant',
                    offeringType: 'PREPAID',
                    monthlyPrice: parsePrice('0'),
                    commitmentTime: 0
                };

            data.reviewJson = new Object();


            hardwareTemplateFunctions = hardwareTemplateFunctions(data);
            hardwareTemplateFunctions.init();

            data.reviewSummaryBox = hardwareTemplateFunctions.reviewSummaryBox;

            accessoryFunctions(data).init();
            additionalServiceFunctions(data).init();
        });

        function parsePrice(str) {
            var price = parseFloat(str);
            return isNaN(price) ? 0 : price;
        }

        var offerings = {};
    </script>""")

(第一部分是在Scrapy選擇器中獲取HTML輸入)

import js2xml
import pprint

# Maps each bundle key (e.g. 'SMART_BASIC.TLF12PLEAS') to a dict of its property values.
data_bundles = {}
for script in selector.xpath('//script/text()').extract():
    # this is how you turn Javascript code into an XML document (lxml document in fact)
    jstree = js2xml.parse(script)

    # then, we're interested in assignments of data.bundles object
    for a in jstree.xpath('//assign[left//property/identifier/@name="bundles" and right/object]'):
        # the assigned property is given by a <string> property from a <bracketaccessor>
        bundle_prop = a.xpath('./left/bracketaccessor/property/string/text()')
        if not bundle_prop:
            # xpath() returns a list, never None: an empty list means this assignment
            # has no string key (e.g. `data.bundles = {...}`), so there is nothing to record.
            continue
        curr_prop = bundle_prop[0]

        data_bundles[curr_prop] = {}

        # the left object is assigned an object (inside a <right> element)
        # let's loop on the <property> elements)
        # the values are either numbers or string arguments of a function call
        for prop in a.xpath('./right/object/property'):
            values = prop.xpath('.//number/@value | .//string/text()')
            # Store None instead of raising IndexError when a property holds neither
            # a number nor a non-empty string.
            data_bundles[curr_prop][prop.xpath('@name')[0]] = values[0] if values else None

pprint.pprint(data_bundles)

這就是你從中得到的:

{'KONTANT_KOMPLETT.REGULAR': {'commitmentTime': '0',
                              'monthlyPrice': '0',
                              'newMsisdnFee': '0',
                              'offeringTitle': 'SMART Kontant',
                              'offeringType': 'PREPAID',
                              'signupFee': '0',
                              'upfrontPrice': '3499'},
 'SMART_BASIC.TLF12PLEAS': {'commitmentTime': '12',
                            'monthlyPrice': '299',
                            'newMsisdnFee': '199',
                            'offeringTitle': 'SMART Basis',
                            'offeringType': 'VOICE',
                            'signupFee': '0',
                            'upfrontPrice': '2199'},
 'SMART_MINI.TLF12PLEAS': {'commitmentTime': '12',
                           'monthlyPrice': '199',
                           'newMsisdnFee': '199',
                           'offeringTitle': 'SMART Mini',
                           'offeringType': 'VOICE',
                           'signupFee': '0',
                           'upfrontPrice': '2999'},
 'SMART_PLUSS.TLF12PLEAS': {'commitmentTime': '12',
                            'monthlyPrice': '399',
                            'newMsisdnFee': '199',
                            'offeringTitle': 'SMART Pluss',
                            'offeringType': 'VOICE',
                            'signupFee': '0',
                            'upfrontPrice': '1599'},
 'SMART_SUPERX.TLF12PLEAS': {'commitmentTime': '12',
                             'monthlyPrice': '499',
                             'newMsisdnFee': '199',
                             'offeringTitle': 'SMART Super',
                             'offeringType': 'VOICE',
                             'signupFee': '0',
                             'upfrontPrice': '1099'}}

有關使用js2xml.parse()獲得的XML模式的更多信息,可以查看https://github.com/redapple/js2xml/blob/master/SCHEMA.rst

正則表達式似乎是正確的:

r"data\.bundles\[[^\]]*\] = {([^}]*)}"

*在正則表達式中是貪婪的 - 它總是盡可能地匹配,所以我使用[^\]]來確保我將匹配最接近的]。 我用{}括號做同樣的事情。 另外,我不必擔心.不匹配換行符。

這個腳本需要安裝Mozilla Firefox和python-selenium,我也使用一個名為script.txt的文件進行測試,該文件包含由<script>標簽包圍的腳本。 這是代碼:

from selenium import webdriver

# Read the saved <script> block; the context manager closes the file handle
# instead of leaving it to the garbage collector.
with open("script.txt") as script_file:
    script_content = script_file.read()

#Removing script tags
exec_script = script_content.replace("<script>", "").replace("</script>", "")

#Removing jq function call
# (this strips every "});" occurrence, so the wrapped statements run at the top
# level of the injected script and `data` is reachable by the return statement)
exec_script = exec_script.replace("jq(function() {", "").replace("});", "")

#Setting some helper functions to avoid javascript errors
helper_functions = """function hardwareTemplateFunctions(){
                     return {init: function(){}};};  
                     accessoryFunctions = additionalServiceFunctions = 
                     hardwareTemplateFunctions;"""

#Returning data variable
return_statement = "return data;"

wd = webdriver.Firefox()
try:
    #Getting data variable in result
    result = wd.execute_script(helper_functions + exec_script +  return_statement)
finally:
    # Always shut the browser down, even if the script raises, so no Firefox
    # process is left running.
    wd.quit()

結果變量如下所示:

{u'bundles': {u'KONTANT_KOMPLETT.REGULAR': {u'commitmentTime': 0,
   u'monthlyPrice': 0,
   u'newMsisdnFee': 0,
   u'offeringTitle': u'SMART Kontant',
   u'offeringType': u'PREPAID',
   u'signupFee': 0,
   u'upfrontPrice': 3499},
  u'SMART_BASIC.TLF12PLEAS': {u'commitmentTime': 12,
   u'monthlyPrice': 299,
   u'newMsisdnFee': 199,
   u'offeringTitle': u'SMART Basis',
   u'offeringType': u'VOICE',
   u'signupFee': 0,
   u'upfrontPrice': 2199},
  u'SMART_MINI.TLF12PLEAS': {u'commitmentTime': 12,
   u'monthlyPrice': 199,
   u'newMsisdnFee': 199,
   u'offeringTitle': u'SMART Mini',
   u'offeringType': u'VOICE',
   u'signupFee': 0,
   u'upfrontPrice': 2999},
  u'SMART_PLUSS.TLF12PLEAS': {u'commitmentTime': 12,
   u'monthlyPrice': 399,
   u'newMsisdnFee': 199,
   u'offeringTitle': u'SMART Pluss',
   u'offeringType': u'VOICE',
   u'signupFee': 0,
   u'upfrontPrice': 1599},
  u'SMART_SUPERX.TLF12PLEAS': {u'commitmentTime': 12,
   u'monthlyPrice': 499,
   u'newMsisdnFee': 199,
   u'offeringTitle': u'SMART Super',
   u'offeringType': u'VOICE',
   u'signupFee': 0,
   u'upfrontPrice': 1099}},
 u'categoryId': 10001,
 u'defaultTab': u'',
 u'hardwareProductCode': u'9054832',
 u'offeringCode': u'SMART_BASIC.TLF12PLEAS',
 u'reviewJson': {},
 u'reviewSummaryBox': None}

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM