简体   繁体   中英

Extract data from JSON using BeautifulSoup and Python

I'm new to web scraping. I have used BeautifulSoup and the code below to get the script containing the JSON, but I am not able to find a method to extract the JSON from it.

# Parse the fetched page; rMain1 is presumably the HTTP response object
# obtained earlier (its creation is not shown in this snippet).
soupMain1 = BeautifulSoup(rMain1.text, 'html.parser')
# find() returns only the FIRST <script> element whose text matches the
# pattern, so despite its name all_scripts holds a single tag (or None).
all_scripts = soupMain1.find('script', text=re.compile("recentHistoryList"))
# Call form of print: valid on both Python 2 and Python 3.
print(all_scripts)

The output is as follows:

<script type="text/javascript">
require(['ta/Core/TA.Store'], function(taStore) {
      taStore.store('typeahead.typeahead2_mixed_ui', true);

      taStore.store('typeahead.typeahead2_geo_segmented_ui', true);

taStore.store('typeahead.geoArea', 'Singapore area');     taStore.store('typeahead.worldwide', 'Worldwide');     taStore.store('typeahead.noResultsFound', 'No results found.'); 
      taStore.store('typeahead.flight_enabled', true);

          taStore.store('typeahead.localAirports', []);

          taStore.store('typeahead.recentHistoryList', [{"war_url":"\/UserReview-g294265-d301581-Fairmont_Singapore-Singapore.html","autobroadened":"false","normalized_name":"fairmont singapore","type":"HOTEL","title":"Hotels","is_vr":false,"url":"\/Hotel_Review-g294265-d301581-Reviews-Fairmont_Singapore-Singapore.html","urls":[{"url_type":"hotel","name":"Fairmont Singapore, Singapore, Singapore","type":"HOTEL","url":"\/Hotel_Review-g294265-d301581-Reviews-Fairmont_Singapore-Singapore.html"}],"is_broad":false,"scope":"global","name":"Fairmont Singapore, Singapore, Singapore","data_type":"LOCATION","details":{"parent_name":"Singapore","grandparent_name":"Singapore","highlighted_name":"Fairmont Singapore","name":"Fairmont Singapore","parent_ids":[294265,294262,2,1],"geo_name":"Singapore, Singapore"},"value":301581,"coords":"1.293826,103.85387"},{"lookbackServlet":null,"autobroadened":"false","normalized_name":"singapore","title":"Destinations","type":"GEO","is_vr":true,"url":"\/Tourism-g294265-Singapore-Vacations.html","urls":[{"url_type":"geo","name":"Singapore Tourism","fallback_url":"\/Tourism-g294265-Singapore-Vacations.html","type":"GEO","url":"\/Tourism-g294265-Singapore-Vacations.html"},{"url_type":"vr","name":"Singapore Holiday Homes","fallback_url":"\/VacationRentals-g294265-Reviews-Singapore-Vacation_Rentals.html","type":"VACATION_RENTAL","url":"\/VacationRentals-g294265-Reviews-Singapore-Vacation_Rentals.html"},{"url_type":"eat","name":"Singapore Restaurants","fallback_url":"\/Restaurants-g294265-Singapore.html","type":"EATERY","url":"\/Restaurants-g294265-Singapore.html"},{"url_type":"attr","name":"Singapore Attractions","fallback_url":"\/Attractions-g294265-Activities-Singapore.html","type":"ATTRACTION","url":"\/Attractions-g294265-Activities-Singapore.html"},{"url_type":"hotel","name":"Singapore 
Hotels","fallback_url":"\/Hotels-g294265-Singapore-Hotels.html","type":"HOTEL","url":"\/Hotels-g294265-Singapore-Hotels.html"},{"url_type":"flights_to","name":"Flights to Singapore","fallback_url":"\/Flights-g294265-Singapore-Cheap_Discount_Airfares.html","type":"FLIGHTS_TO","url":"\/Flights-g294265-Singapore-Cheap_Discount_Airfares.html"},{"url_type":"nbrhd","name":"Singapore Neighbourhoods","fallback_url":"\/NeighborhoodList-g294265-Singapore.html","type":"NEIGHBORHOOD","url":"\/NeighborhoodList-g294265-Singapore.html"},{"url_type":"tg","name":"Singapore Travel Guides","fallback_url":"\/Travel_Guide-g294265-Singapore.html","type":"TRAVEL_GUIDE","url":"\/Travel_Guide-g294265-Singapore.html"}],"is_broad":false,"scope":"global","name":"Singapore, Singapore, Asia","data_type":"LOCATION","details":{"parent_name":"Singapore","grandparent_name":"Asia","rac_enabled":false,"highlighted_name":"Singapore","name":"Singapore","parent_ids":[294262,2,1],"geo_name":"Singapore, Asia"},"value":294265,"coords":"1.285801,103.85111"}]);

    taStore.store('typeahead.restaurant', "Restaurant");         taStore.store('typeahead.attraction', "Attraction");         taStore.store('typeahead.hotel', "Hotel");                       taStore.store('typeahead.restaurant_list', "Restaurants");       taStore.store('typeahead.attraction_list', "Attractions");       taStore.store('typeahead.things_to_do', "Places to Visit");                 taStore.store('typeahead.hotel_list', "Hotels");                 taStore.store('typeahead.flight_list', "Flights");                   taStore.store('typeahead.vacation_rental_list', "Holiday Rentals");     taStore.store('typeahead.scoped.static_local_label', '% area');     taStore.store('typeahead.scoped.result_title_text', 'Start typing, or try one of these suggestions...');     taStore.store('typeahead.scoped.poi_overview_geo', '<span class="poi_overview_item">Overview</span> of %');     taStore.store('typeahead.scoped.poi_hotels_geo', '<span class="poi_overview_item">Hotels</span> in %');     taStore.store('typeahead.scoped.poi_hotels_geo_near', '<span class="poi_overview_item">Hotels</span> near %');     taStore.store('typeahead.scoped.poi_vr_geo', '<span class="poi_overview_item">Holiday Rentals</span> in %');     taStore.store('typeahead.scoped.poi_vr_geo_near', '<span class="poi_overview_item">Holiday Rentals</span> near %');     taStore.store('typeahead.scoped.poi_attractions_geo', '<span class="poi_overview_item">Things to Do</span> in %');     taStore.store('typeahead.scoped.poi_eat_geo', '<span class="poi_overview_item">Restaurants</span> in %');     taStore.store('typeahead.scoped.poi_flights_geo', '<span class="poi_overview_item">Flights</span> to %');     taStore.store('typeahead.scoped.poi_nbrhd_geo', '<span class="poi_overview_item">Neighbourhoods</span> in %');     taStore.store('typeahead.scoped.poi_travel_guides_geo', '<span class="poi_overview_item">Travel Guides</span> in %');     taStore.store('typeahead.scoped.overview', 'Overview ');     
taStore.store('typeahead.scoped.neighborhoods', 'Neighbourhoods');     taStore.store('typeahead.scoped.travel_guides', 'Travel Guides');     taStore.store('typeahead.scoped.geo_area_template', '% area');     taStore.store('typeahead.searchMore', 'Find more results for "%"');
taStore.store('typeahead.history', 'Recently viewed');     taStore.store('typeahead.history.all_caps', 'RECENTLY VIEWED');     taStore.store('typeahead.popular_destinations', 'POPULAR DESTINATIONS'); 
  });

I want to get the value of the first occurrence of the key "coords".

import json
from bs4 import BeautifulSoup

html = """
<script type="text/javascript">
require(['ta/Core/TA.Store'], function(taStore) {
      taStore.store('typeahead.typeahead2_mixed_ui', true);

      taStore.store('typeahead.typeahead2_geo_segmented_ui', true);

taStore.store('typeahead.geoArea', 'Singapore area');     taStore.store('typeahead.worldwide', 'Worldwide');     taStore.store('typeahead.noResultsFound', 'No results found.'); 
      taStore.store('typeahead.flight_enabled', true);

          taStore.store('typeahead.localAirports', []);

          taStore.store('typeahead.recentHistoryList', [{"war_url":"\/UserReview-g294265-d301581-Fairmont_Singapore-Singapore.html","autobroadened":"false","normalized_name":"fairmont singapore","type":"HOTEL","title":"Hotels","is_vr":false,"url":"\/Hotel_Review-g294265-d301581-Reviews-Fairmont_Singapore-Singapore.html","urls":[{"url_type":"hotel","name":"Fairmont Singapore, Singapore, Singapore","type":"HOTEL","url":"\/Hotel_Review-g294265-d301581-Reviews-Fairmont_Singapore-Singapore.html"}],"is_broad":false,"scope":"global","name":"Fairmont Singapore, Singapore, Singapore","data_type":"LOCATION","details":{"parent_name":"Singapore","grandparent_name":"Singapore","highlighted_name":"Fairmont Singapore","name":"Fairmont Singapore","parent_ids":[294265,294262,2,1],"geo_name":"Singapore, Singapore"},"value":301581,"coords":"1.293826,103.85387"},{"lookbackServlet":null,"autobroadened":"false","normalized_name":"singapore","title":"Destinations","type":"GEO","is_vr":true,"url":"\/Tourism-g294265-Singapore-Vacations.html","urls":[{"url_type":"geo","name":"Singapore Tourism","fallback_url":"\/Tourism-g294265-Singapore-Vacations.html","type":"GEO","url":"\/Tourism-g294265-Singapore-Vacations.html"},{"url_type":"vr","name":"Singapore Holiday Homes","fallback_url":"\/VacationRentals-g294265-Reviews-Singapore-Vacation_Rentals.html","type":"VACATION_RENTAL","url":"\/VacationRentals-g294265-Reviews-Singapore-Vacation_Rentals.html"},{"url_type":"eat","name":"Singapore Restaurants","fallback_url":"\/Restaurants-g294265-Singapore.html","type":"EATERY","url":"\/Restaurants-g294265-Singapore.html"},{"url_type":"attr","name":"Singapore Attractions","fallback_url":"\/Attractions-g294265-Activities-Singapore.html","type":"ATTRACTION","url":"\/Attractions-g294265-Activities-Singapore.html"},{"url_type":"hotel","name":"Singapore 
Hotels","fallback_url":"\/Hotels-g294265-Singapore-Hotels.html","type":"HOTEL","url":"\/Hotels-g294265-Singapore-Hotels.html"},{"url_type":"flights_to","name":"Flights to Singapore","fallback_url":"\/Flights-g294265-Singapore-Cheap_Discount_Airfares.html","type":"FLIGHTS_TO","url":"\/Flights-g294265-Singapore-Cheap_Discount_Airfares.html"},{"url_type":"nbrhd","name":"Singapore Neighbourhoods","fallback_url":"\/NeighborhoodList-g294265-Singapore.html","type":"NEIGHBORHOOD","url":"\/NeighborhoodList-g294265-Singapore.html"},{"url_type":"tg","name":"Singapore Travel Guides","fallback_url":"\/Travel_Guide-g294265-Singapore.html","type":"TRAVEL_GUIDE","url":"\/Travel_Guide-g294265-Singapore.html"}],"is_broad":false,"scope":"global","name":"Singapore, Singapore, Asia","data_type":"LOCATION","details":{"parent_name":"Singapore","grandparent_name":"Asia","rac_enabled":false,"highlighted_name":"Singapore","name":"Singapore","parent_ids":[294262,2,1],"geo_name":"Singapore, Asia"},"value":294265,"coords":"1.285801,103.85111"}]);

    taStore.store('typeahead.restaurant', "Restaurant");         taStore.store('typeahead.attraction', "Attraction");         taStore.store('typeahead.hotel', "Hotel");                       taStore.store('typeahead.restaurant_list', "Restaurants");       taStore.store('typeahead.attraction_list', "Attractions");       taStore.store('typeahead.things_to_do', "Places to Visit");                 taStore.store('typeahead.hotel_list', "Hotels");                 taStore.store('typeahead.flight_list', "Flights");                   taStore.store('typeahead.vacation_rental_list', "Holiday Rentals");     taStore.store('typeahead.scoped.static_local_label', '% area');     taStore.store('typeahead.scoped.result_title_text', 'Start typing, or try one of these suggestions...');     taStore.store('typeahead.scoped.poi_overview_geo', '<span class="poi_overview_item">Overview</span> of %');     taStore.store('typeahead.scoped.poi_hotels_geo', '<span class="poi_overview_item">Hotels</span> in %');     taStore.store('typeahead.scoped.poi_hotels_geo_near', '<span class="poi_overview_item">Hotels</span> near %');     taStore.store('typeahead.scoped.poi_vr_geo', '<span class="poi_overview_item">Holiday Rentals</span> in %');     taStore.store('typeahead.scoped.poi_vr_geo_near', '<span class="poi_overview_item">Holiday Rentals</span> near %');     taStore.store('typeahead.scoped.poi_attractions_geo', '<span class="poi_overview_item">Things to Do</span> in %');     taStore.store('typeahead.scoped.poi_eat_geo', '<span class="poi_overview_item">Restaurants</span> in %');     taStore.store('typeahead.scoped.poi_flights_geo', '<span class="poi_overview_item">Flights</span> to %');     taStore.store('typeahead.scoped.poi_nbrhd_geo', '<span class="poi_overview_item">Neighbourhoods</span> in %');     taStore.store('typeahead.scoped.poi_travel_guides_geo', '<span class="poi_overview_item">Travel Guides</span> in %');     taStore.store('typeahead.scoped.overview', 'Overview ');     
taStore.store('typeahead.scoped.neighborhoods', 'Neighbourhoods');     taStore.store('typeahead.scoped.travel_guides', 'Travel Guides');     taStore.store('typeahead.scoped.geo_area_template', '% area');     taStore.store('typeahead.searchMore', 'Find more results for "%"');
taStore.store('typeahead.history', 'Recently viewed');     taStore.store('typeahead.history.all_caps', 'RECENTLY VIEWED');     taStore.store('typeahead.popular_destinations', 'POPULAR DESTINATIONS'); 
  });
</script>
"""

def extract_json_argument(script_text, anchor):
    """Return the JSON value that immediately follows *anchor* in *script_text*.

    The value is parsed in place with ``JSONDecoder.raw_decode``, which stops
    at the end of the first complete JSON document.  This avoids hard-coding
    the anchor length and avoids searching for a closing marker such as
    ``"}]);"`` that could also occur inside a JSON string or elsewhere in
    the script.

    Args:
        script_text: Full text of the script that embeds the JSON.
        anchor: Literal text that directly precedes the JSON value.

    Returns:
        The decoded Python object (list, dict, str, ...).

    Raises:
        ValueError: If *anchor* does not occur in *script_text*, or if the
            text after it is not valid JSON (``json.JSONDecodeError`` is a
            subclass of ``ValueError``).
    """
    anchor_pos = script_text.find(anchor)
    if anchor_pos == -1:
        raise ValueError("anchor %r not found in script text" % (anchor,))

    start = anchor_pos + len(anchor)
    # raw_decode() does not skip leading whitespace on its own.
    while start < len(script_text) and script_text[start].isspace():
        start += 1

    # strict=False tolerates raw control characters (for example a line
    # break) inside JSON strings, which scraped script text may contain.
    decoder = json.JSONDecoder(strict=False)
    value, _end = decoder.raw_decode(script_text, start)
    return value


if __name__ == "__main__":
    soup = BeautifulSoup(html, "html.parser")
    # Find the script element containing the JSON.
    anchor = "taStore.store('typeahead.recentHistoryList', "
    script = soup.find(lambda tag: tag.name == "script" and anchor in tag.text)
    if script is None:
        raise SystemExit("no <script> element contains the recentHistoryList call")
    # Extract and load the JSON.
    history = extract_json_argument(script.text, anchor)
    # Read the data from the JSON: "coords" of the first history entry.
    print(history[0]['coords'])

Outputs:

1.293826,103.85387

The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please indicate the site URL or the original address. If you have any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM