简体   繁体   中英

How to get javascript dynamic content from website

I'm trying to get dynamic content from a website.

I tried to get the content with Scrapy, but the content is loaded by a JavaScript file, so it does not show up in the extracted text.

Then I installed Selenium for this, but now I'm getting a "No such session" error.

For example, this is the page whose content I'm trying to get:

http://www.hepsiburada.com/fox-fitness-new-target-70e-2-5-hp-motorlu-masajli-kosu-bandi-hediye-secenekleriyle-p-SPORKONKSBFOX0081?magaza=Finspor

And this is what I tried for this website:

# Fragment of a scraping callback from the question; the enclosing def is not shown.
# NOTE(review): ProductItem is assigned without being called, so `item` is bound
# to ProductItem itself rather than to a new instance -- confirm intent.
item = ProductItem
        # text node(s) of the <h1 class="product-name"> element
        name = response.css('h1.product-name::text').extract_first()
        # text of the <span> directly inside the span whose id is "offering-price"
        price = response.css('span[id=offering-price] > span::text').extract_first()
        # positional lookup: the 17th <script> child of <head>; breaks if the page
        # adds or removes a script before it
        xpath = response.xpath('/html/head/script[17]')
        # capture everything from the first "{" after " = " up to the last "}" on that line
        # NOTE(review): "\{" in a non-raw string is an invalid escape sequence; a raw
        # string (r"...") would avoid the interpreter warning
        data = xpath.re(" = (\{.+\})")
        print(data)

And this is the content which I am trying to get.

 var utagData = {"merchant_names":["Finspor"],"new_site":"new","order_store":"Finspor","order_currency":"TRY","page_domain":"www.hepsiburada.com","page_language":"tr-TR","page_site_name":"Hepsiburada","page_site_region":"tr","site_type":"desktop","page_type":"pdp","page_name":"Product Detail","category_path":"/product/spor-outdoor/spor-fitness/fitness-kondisyon/kosu-bantlari/sporkonksbfox008/","page_title":"Fox Fitness New Target 70E 2.5 Hp Motorlu, Masajlı Koşu Fiyatı","page_url":"http://www.hepsiburada.com/fox-fitness-new-target-70e-2-5-hp-motorlu-masajli-kosu-bandi-hediye-secenekleriyle-p-SPORKONKSBFOX0081?magaza=Finspor","page_referring_url":"http://www.hepsiburada.com/gunun-firsati-teklifi?element=1","page_query_string":["magaza=Finspor"],"is_canonical":"1","canonical_url":"http://www.hepsiburada.com/fox-fitness-new-target-70e-2-5-hp-motorlu-masajli-kosu-bandi-hediye-secenekleriyle-pm-sporkonksbfox008","product_prices":["999.00"],"product_unit_prices":["999.00"],"product_brands":["Fox Fitness"],"product_brand":"Fox Fitness","product_skus":["SPORKONKSBFOX0081"],"product_ids":["sporkonksbfox008"],"product_top_5":["sporkonksbfox008"],"product_names":["Fox Fitness New Target 70E 2.5 Hp Motorlu, Masajlı Koşu Bandı (Hediye Seçenekleriyle)"],"product_category_ids":["19249"],"product_categories":["kosu-bantlari"],"shipping_type":["super-hizli"],"product_quantities":["1"],"product_barcodes":["8691128100776"],"product_barcode":"8691128100776","product_name_array":"Fox Fitness New Target 70E 2.5 Hp Motorlu, Masajlı Koşu Bandı (Hediye Seçenekleriyle)","merchant_ids":["95df0e3483104fc1a16cca6e38bc45cc"],"order_subtotal":["999.00"],"category_id_hierarchy":"60001546 > 2147483635 > 353045 > 19249","category_name_hierarchy":"Spor Outdoor > Spor / Fitness > Fitness - Kondisyon > Koşu Bantları","product_status":"InStock"};
    var utagObject = utagData;
    var utag_data = {"merchant_names":["Finspor"],"new_site":"new","order_store":"Finspor","order_currency":"TRY","page_domain":"www.hepsiburada.com","page_language":"tr-TR","page_site_name":"Hepsiburada","page_site_region":"tr","site_type":"desktop","page_type":"pdp","page_name":"Product Detail","category_path":"/product/spor-outdoor/spor-fitness/fitness-kondisyon/kosu-bantlari/sporkonksbfox008/","page_title":"Fox Fitness New Target 70E 2.5 Hp Motorlu, Masajlı Koşu Fiyatı","page_url":"http://www.hepsiburada.com/fox-fitness-new-target-70e-2-5-hp-motorlu-masajli-kosu-bandi-hediye-secenekleriyle-p-SPORKONKSBFOX0081?magaza=Finspor","page_referring_url":"http://www.hepsiburada.com/gunun-firsati-teklifi?element=1","page_query_string":["magaza=Finspor"],"is_canonical":"1","canonical_url":"http://www.hepsiburada.com/fox-fitness-new-target-70e-2-5-hp-motorlu-masajli-kosu-bandi-hediye-secenekleriyle-pm-sporkonksbfox008","product_prices":["999.00"],"product_unit_prices":["999.00"],"product_brands":["Fox Fitness"],"product_brand":"Fox Fitness","product_skus":["SPORKONKSBFOX0081"],"product_ids":["sporkonksbfox008"],"product_top_5":["sporkonksbfox008"],"product_names":["Fox Fitness New Target 70E 2.5 Hp Motorlu, Masajlı Koşu Bandı (Hediye Seçenekleriyle)"],"product_category_ids":["19249"],"product_categories":["kosu-bantlari"],"shipping_type":["super-hizli"],"product_quantities":["1"],"product_barcodes":["8691128100776"],"product_barcode":"8691128100776","product_name_array":"Fox Fitness New Target 70E 2.5 Hp Motorlu, Masajlı Koşu Bandı (Hediye Seçenekleriyle)","merchant_ids":["95df0e3483104fc1a16cca6e38bc45cc"],"order_subtotal":["999.00"],"category_id_hierarchy":"60001546 > 2147483635 > 353045 > 19249","category_name_hierarchy":"Spor Outdoor > Spor / Fitness > Fitness - Kondisyon > Koşu Bantları","product_status":"InStock"};

There's no need to execute any of the JavaScript here. If you right-click on the page and click "view page source" (or similar), you can find the data right there in JSON format:

# assuming we're crawling:
# 'http://www.hepsiburada.com/fox-fitness-new-target-70e-2-5-hp-motorlu-masajli-kosu-bandi-hediye-secenekleriyle-p-SPORKONKSBFOX0081?magaza=Finspor'

import json

def parse(self, response):
    """Print the ``utagData`` JSON object embedded in the page's <script> tag.

    The data is already present in the raw HTML as a JavaScript object
    literal, so no JavaScript has to be executed to read it.

    :param response: object exposing ``xpath()`` whose result supports
        ``re()`` (e.g. a Scrapy response).
    :returns: None. Prints the decoded dictionary and its
        ``merchant_names`` entry; prints nothing when no matching script
        text is found.
    :raises ValueError: from ``json.loads`` if the captured text is not
        valid JSON.
    """
    # select the text of the <script> node that defines utagData
    node = response.xpath("//script[contains(text(),'var utagData = ')]/text()")
    # raw string so that \{ and \} reach the regex engine untouched instead of
    # being parsed as (invalid) string escape sequences
    matches = node.re(r'= (\{.+\})')
    if not matches:
        # script node missing or page layout changed: nothing to decode
        return
    # the first match is the `var utagData = {...}` assignment;
    # convert the JSON text to a Python dictionary
    data = json.loads(matches[0])
    print(data)
    print(data['merchant_names'])
    # gives ['Finspor']

In the past I used this library to crawl websites and get the content I needed: https://github.com/lapwinglabs/x-ray

It has a good API for finding the specific data you need:

// get title
xray('http://google.com', 'title')(function(err, title) {
  // Report a failed request/parse instead of logging an undefined title.
  if (err) {
    console.error(err);
    return;
  }
  console.log(title);
});

or find by selector:

// get the content matched by the `.content` selector
xray('http://reddit.com', '.content')(function(err, innerHTML) {
    // Report a failed request/parse instead of logging an undefined value.
    if (err) {
        console.error(err);
        return;
    }
    console.log(innerHTML);
});

or get a specific attribute value:

// get the `src` attribute of the element matched by `img.logo`
xray('http://techcrunch.com', 'img.logo@src')(function(err, value) {
    // Report a failed request/parse instead of logging an undefined value.
    if (err) {
        console.error(err);
        return;
    }
    console.log(value);
});

So please take a look at this library. Maybe it will help you achieve the required results.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM