Extract multi-line javascript content from <script> tag using Scrapy

前端 未结 3 1721
借酒劲吻你
借酒劲吻你 2021-01-13 15:37

I'm trying to extract data from this script tag using Scrapy:



        
相关标签:
3条回答
  • 2021-01-13 15:45

    If you don't want to play around with regular expressions, there's js2xml, which parses Javascript code and converts it to an lxml document. Then you can use XPath to query things from the Javascript statements. (disclaimer: I wrote and maintain js2xml)

    Here's sample code on how to get those data.bundles assignments:

    import scrapy
    
    # Wrap the sample HTML in a Scrapy Selector built from the text below.
    # Note that every data.bundles[...] object literal in this JavaScript lists
    # the monthlyPrice and commitmentTime keys twice.
    selector = scrapy.Selector(text="""<script>
            var hardwareTemplateFunctions;
            var storefrontContextUrl = '';
    
            jq(function() {
                var data = new Object();
                data.hardwareProductCode = '9054832';
                data.offeringCode = 'SMART_BASIC.TLF12PLEAS';
                data.defaultTab = '';
                data.categoryId = 10001;
    
                data.bundles = new Object();
                                data.bundles['SMART_SUPERX.TLF12PLEAS'] = {
                        signupFee: parsePrice('0'),
                        newMsisdnFee: parsePrice('199'),
                        upfrontPrice: parsePrice('1099'),
                        monthlyPrice: parsePrice('499'),
                        commitmentTime: parsePrice('12'),
                        offeringTitle: 'SMART Super',
                        offeringType: 'VOICE',
                        monthlyPrice: parsePrice('499'),
                        commitmentTime: 12
                    };
                                data.bundles['SMART_PLUSS.TLF12PLEAS'] = {
                        signupFee: parsePrice('0'),
                        newMsisdnFee: parsePrice('199'),
                        upfrontPrice: parsePrice('1599'),
                        monthlyPrice: parsePrice('399'),
                        commitmentTime: parsePrice('12'),
                        offeringTitle: 'SMART Pluss',
                        offeringType: 'VOICE',
                        monthlyPrice: parsePrice('399'),
                        commitmentTime: 12
                    };
                                data.bundles['SMART_BASIC.TLF12PLEAS'] = {
                        signupFee: parsePrice('0'),
                        newMsisdnFee: parsePrice('199'),
                        upfrontPrice: parsePrice('2199'),
                        monthlyPrice: parsePrice('299'),
                        commitmentTime: parsePrice('12'),
                        offeringTitle: 'SMART Basis',
                        offeringType: 'VOICE',
                        monthlyPrice: parsePrice('299'),
                        commitmentTime: 12
                    };
                                data.bundles['SMART_MINI.TLF12PLEAS'] = {
                        signupFee: parsePrice('0'),
                        newMsisdnFee: parsePrice('199'),
                        upfrontPrice: parsePrice('2999'),
                        monthlyPrice: parsePrice('199'),
                        commitmentTime: parsePrice('12'),
                        offeringTitle: 'SMART Mini',
                        offeringType: 'VOICE',
                        monthlyPrice: parsePrice('199'),
                        commitmentTime: 12
                    };
                                data.bundles['KONTANT_KOMPLETT.REGULAR'] = {
                        signupFee: parsePrice('0'),
                        newMsisdnFee: parsePrice('0'),
                        upfrontPrice: parsePrice('3499'),
                        monthlyPrice: parsePrice('0'),
                        commitmentTime: parsePrice('0'),
                        offeringTitle: 'SMART Kontant',
                        offeringType: 'PREPAID',
                        monthlyPrice: parsePrice('0'),
                        commitmentTime: 0
                    };
    
                data.reviewJson = new Object();
    
    
                hardwareTemplateFunctions = hardwareTemplateFunctions(data);
                hardwareTemplateFunctions.init();
    
                data.reviewSummaryBox = hardwareTemplateFunctions.reviewSummaryBox;
    
                accessoryFunctions(data).init();
                additionalServiceFunctions(data).init();
            });
    
            function parsePrice(str) {
                var price = parseFloat(str);
                return isNaN(price) ? 0 : price;
            }
    
            var offerings = {};
        </script>""")
    

    (this first part is to get the HTML input in a Scrapy Selector)

    import js2xml
    import pprint
    
    # Maps each bundle key (e.g. 'SMART_BASIC.TLF12PLEAS') to a dict of that
    # bundle's property names and the literal values found in the JavaScript.
    data_bundles = {}
    for script in selector.xpath('//script/text()').extract():
        # this is how you turn Javascript code into an XML document (lxml document in fact)
        jstree = js2xml.parse(script)

        # then, we're interested in assignments whose left side involves the
        # `bundles` property and whose right side is an object literal
        for a in jstree.xpath('//assign[left//property/identifier/@name="bundles" and right/object]'):
            # the assigned property is given by a <string> property from a <bracketaccessor>
            bundle_prop = a.xpath('./left/bracketaccessor/property/string/text()')
            # xpath() returns a list (possibly empty), never None, so test for
            # emptiness. Assignments without a string key are skipped instead
            # of raising IndexError or silently reusing the previous key.
            if not bundle_prop:
                continue
            curr_prop = bundle_prop[0]

            data_bundles[curr_prop] = {}

            # the left object is assigned an object (inside a <right> element);
            # let's loop on the <property> elements.
            # the values are either numbers or string arguments of a function call
            for prop in a.xpath('./right/object/property'):
                values = prop.xpath('.//number/@value | .//string/text()')
                # A property with no number or string text node is stored as
                # None rather than crashing on values[0].
                # NOTE(review): presumably this covers e.g. empty string
                # literals -- confirm against js2xml's output.
                data_bundles[curr_prop][prop.xpath('@name')[0]] = values[0] if values else None

    pprint.pprint(data_bundles)
    

    This is what you get out of this:

    {'KONTANT_KOMPLETT.REGULAR': {'commitmentTime': '0',
                                  'monthlyPrice': '0',
                                  'newMsisdnFee': '0',
                                  'offeringTitle': 'SMART Kontant',
                                  'offeringType': 'PREPAID',
                                  'signupFee': '0',
                                  'upfrontPrice': '3499'},
     'SMART_BASIC.TLF12PLEAS': {'commitmentTime': '12',
                                'monthlyPrice': '299',
                                'newMsisdnFee': '199',
                                'offeringTitle': 'SMART Basis',
                                'offeringType': 'VOICE',
                                'signupFee': '0',
                                'upfrontPrice': '2199'},
     'SMART_MINI.TLF12PLEAS': {'commitmentTime': '12',
                               'monthlyPrice': '199',
                               'newMsisdnFee': '199',
                               'offeringTitle': 'SMART Mini',
                               'offeringType': 'VOICE',
                               'signupFee': '0',
                               'upfrontPrice': '2999'},
     'SMART_PLUSS.TLF12PLEAS': {'commitmentTime': '12',
                                'monthlyPrice': '399',
                                'newMsisdnFee': '199',
                                'offeringTitle': 'SMART Pluss',
                                'offeringType': 'VOICE',
                                'signupFee': '0',
                                'upfrontPrice': '1599'},
     'SMART_SUPERX.TLF12PLEAS': {'commitmentTime': '12',
                                 'monthlyPrice': '499',
                                 'newMsisdnFee': '199',
                                 'offeringTitle': 'SMART Super',
                                 'offeringType': 'VOICE',
                                 'signupFee': '0',
                                 'upfrontPrice': '1099'}}
    

    For more info on the XML schema you get with js2xml.parse(), you can check https://github.com/redapple/js2xml/blob/master/SCHEMA.rst

    0 讨论(0)
  • 2021-01-13 15:55

    Following regex seems to be correct:

    # Matches "data.bundles[<anything but ]>] = {" and captures everything up
    # to the next "}" (the object literal's body) in group 1.
    r"data\.bundles\[[^\]]*\] = {([^}]*)}"
    

    * in regexes is greedy - it will always try to match as much as possible, so I use [^\]] to make sure that I match only up to the closest ]. I do the same with the {} brackets. Additionally, because a negated character class also matches newlines, I don't have to worry about . not matching newline.

    0 讨论(0)
  • 2021-01-13 16:00

    This script requires Mozilla Firefox and python-selenium to be installed. I tested it using a file called script.txt, which contains the script surrounded by <script> tags. Here's the code:

    from selenium import webdriver
    
    # Read the saved page fragment; the context manager guarantees the file
    # handle is closed instead of being left to the garbage collector.
    with open("script.txt") as script_file:
        script_content = script_file.read()
    
    # Removing script tags
    exec_script = script_content.replace("<script>", "").replace("</script>", "")
    
    # Removing jq function call so that `data` is defined at the top level of
    # the executed script and can be returned
    exec_script = exec_script.replace("jq(function() {", "").replace("});", "")
    
    # Setting some helper functions to avoid javascript errors
    helper_functions = """function hardwareTemplateFunctions(){
                         return {init: function(){}};};  
                         accessoryFunctions = additionalServiceFunctions = 
                         hardwareTemplateFunctions;"""
    
    # Returning data variable
    return_statement = "return data;"
    
    wd = webdriver.Firefox()
    try:
        # Getting data variable in result
        result = wd.execute_script(helper_functions + exec_script +  return_statement)
    finally:
        # Always shut the browser down, even when the script raises, so no
        # Firefox process is left running.
        wd.quit()
    

    The result variable looks like this:

    {u'bundles': {u'KONTANT_KOMPLETT.REGULAR': {u'commitmentTime': 0,
       u'monthlyPrice': 0,
       u'newMsisdnFee': 0,
       u'offeringTitle': u'SMART Kontant',
       u'offeringType': u'PREPAID',
       u'signupFee': 0,
       u'upfrontPrice': 3499},
      u'SMART_BASIC.TLF12PLEAS': {u'commitmentTime': 12,
       u'monthlyPrice': 299,
       u'newMsisdnFee': 199,
       u'offeringTitle': u'SMART Basis',
       u'offeringType': u'VOICE',
       u'signupFee': 0,
       u'upfrontPrice': 2199},
      u'SMART_MINI.TLF12PLEAS': {u'commitmentTime': 12,
       u'monthlyPrice': 199,
       u'newMsisdnFee': 199,
       u'offeringTitle': u'SMART Mini',
       u'offeringType': u'VOICE',
       u'signupFee': 0,
       u'upfrontPrice': 2999},
      u'SMART_PLUSS.TLF12PLEAS': {u'commitmentTime': 12,
       u'monthlyPrice': 399,
       u'newMsisdnFee': 199,
       u'offeringTitle': u'SMART Pluss',
       u'offeringType': u'VOICE',
       u'signupFee': 0,
       u'upfrontPrice': 1599},
      u'SMART_SUPERX.TLF12PLEAS': {u'commitmentTime': 12,
       u'monthlyPrice': 499,
       u'newMsisdnFee': 199,
       u'offeringTitle': u'SMART Super',
       u'offeringType': u'VOICE',
       u'signupFee': 0,
       u'upfrontPrice': 1099}},
     u'categoryId': 10001,
     u'defaultTab': u'',
     u'hardwareProductCode': u'9054832',
     u'offeringCode': u'SMART_BASIC.TLF12PLEAS',
     u'reviewJson': {},
     u'reviewSummaryBox': None}
    
    0 讨论(0)
提交回复
热议问题