簡體   English   中英

使用beautifulsoup和python從json提取數據

[英]extract data from json using beautifulsoup and python

我是Web爬蟲的新手,我使用了beautifulsoup和下面的代碼來獲取包含json的腳本,但我找不到提取它的方法。

# Parse the fetched page. rMain1 is defined outside this snippet
# (presumably the HTTP response for the page -- confirm against the caller).
soupMain1 = BeautifulSoup(rMain1.text, 'html.parser')
# find() returns only the FIRST <script> element whose text matches the
# pattern (or None when nothing matches), despite the plural variable name.
all_scripts = soupMain1.find('script', text=re.compile("recentHistoryList"))
# Call form of print: valid on both Python 2 and Python 3, whereas the
# statement form `print all_scripts` is a SyntaxError on Python 3.
print(all_scripts)

輸出如下:

<script type="text/javascript">
require(['ta/Core/TA.Store'], function(taStore) {
      taStore.store('typeahead.typeahead2_mixed_ui', true);

      taStore.store('typeahead.typeahead2_geo_segmented_ui', true);

taStore.store('typeahead.geoArea', 'Singapore area');     taStore.store('typeahead.worldwide', 'Worldwide');     taStore.store('typeahead.noResultsFound', 'No results found.'); 
      taStore.store('typeahead.flight_enabled', true);

          taStore.store('typeahead.localAirports', []);

          taStore.store('typeahead.recentHistoryList', [{"war_url":"\/UserReview-g294265-d301581-Fairmont_Singapore-Singapore.html","autobroadened":"false","normalized_name":"fairmont singapore","type":"HOTEL","title":"Hotels","is_vr":false,"url":"\/Hotel_Review-g294265-d301581-Reviews-Fairmont_Singapore-Singapore.html","urls":[{"url_type":"hotel","name":"Fairmont Singapore, Singapore, Singapore","type":"HOTEL","url":"\/Hotel_Review-g294265-d301581-Reviews-Fairmont_Singapore-Singapore.html"}],"is_broad":false,"scope":"global","name":"Fairmont Singapore, Singapore, Singapore","data_type":"LOCATION","details":{"parent_name":"Singapore","grandparent_name":"Singapore","highlighted_name":"Fairmont Singapore","name":"Fairmont Singapore","parent_ids":[294265,294262,2,1],"geo_name":"Singapore, Singapore"},"value":301581,"coords":"1.293826,103.85387"},{"lookbackServlet":null,"autobroadened":"false","normalized_name":"singapore","title":"Destinations","type":"GEO","is_vr":true,"url":"\/Tourism-g294265-Singapore-Vacations.html","urls":[{"url_type":"geo","name":"Singapore Tourism","fallback_url":"\/Tourism-g294265-Singapore-Vacations.html","type":"GEO","url":"\/Tourism-g294265-Singapore-Vacations.html"},{"url_type":"vr","name":"Singapore Holiday Homes","fallback_url":"\/VacationRentals-g294265-Reviews-Singapore-Vacation_Rentals.html","type":"VACATION_RENTAL","url":"\/VacationRentals-g294265-Reviews-Singapore-Vacation_Rentals.html"},{"url_type":"eat","name":"Singapore Restaurants","fallback_url":"\/Restaurants-g294265-Singapore.html","type":"EATERY","url":"\/Restaurants-g294265-Singapore.html"},{"url_type":"attr","name":"Singapore Attractions","fallback_url":"\/Attractions-g294265-Activities-Singapore.html","type":"ATTRACTION","url":"\/Attractions-g294265-Activities-Singapore.html"},{"url_type":"hotel","name":"Singapore 
Hotels","fallback_url":"\/Hotels-g294265-Singapore-Hotels.html","type":"HOTEL","url":"\/Hotels-g294265-Singapore-Hotels.html"},{"url_type":"flights_to","name":"Flights to Singapore","fallback_url":"\/Flights-g294265-Singapore-Cheap_Discount_Airfares.html","type":"FLIGHTS_TO","url":"\/Flights-g294265-Singapore-Cheap_Discount_Airfares.html"},{"url_type":"nbrhd","name":"Singapore Neighbourhoods","fallback_url":"\/NeighborhoodList-g294265-Singapore.html","type":"NEIGHBORHOOD","url":"\/NeighborhoodList-g294265-Singapore.html"},{"url_type":"tg","name":"Singapore Travel Guides","fallback_url":"\/Travel_Guide-g294265-Singapore.html","type":"TRAVEL_GUIDE","url":"\/Travel_Guide-g294265-Singapore.html"}],"is_broad":false,"scope":"global","name":"Singapore, Singapore, Asia","data_type":"LOCATION","details":{"parent_name":"Singapore","grandparent_name":"Asia","rac_enabled":false,"highlighted_name":"Singapore","name":"Singapore","parent_ids":[294262,2,1],"geo_name":"Singapore, Asia"},"value":294265,"coords":"1.285801,103.85111"}]);

    taStore.store('typeahead.restaurant', "Restaurant");         taStore.store('typeahead.attraction', "Attraction");         taStore.store('typeahead.hotel', "Hotel");                       taStore.store('typeahead.restaurant_list', "Restaurants");       taStore.store('typeahead.attraction_list', "Attractions");       taStore.store('typeahead.things_to_do', "Places to Visit");                 taStore.store('typeahead.hotel_list', "Hotels");                 taStore.store('typeahead.flight_list', "Flights");                   taStore.store('typeahead.vacation_rental_list', "Holiday Rentals");     taStore.store('typeahead.scoped.static_local_label', '% area');     taStore.store('typeahead.scoped.result_title_text', 'Start typing, or try one of these suggestions...');     taStore.store('typeahead.scoped.poi_overview_geo', '<span class="poi_overview_item">Overview</span> of %');     taStore.store('typeahead.scoped.poi_hotels_geo', '<span class="poi_overview_item">Hotels</span> in %');     taStore.store('typeahead.scoped.poi_hotels_geo_near', '<span class="poi_overview_item">Hotels</span> near %');     taStore.store('typeahead.scoped.poi_vr_geo', '<span class="poi_overview_item">Holiday Rentals</span> in %');     taStore.store('typeahead.scoped.poi_vr_geo_near', '<span class="poi_overview_item">Holiday Rentals</span> near %');     taStore.store('typeahead.scoped.poi_attractions_geo', '<span class="poi_overview_item">Things to Do</span> in %');     taStore.store('typeahead.scoped.poi_eat_geo', '<span class="poi_overview_item">Restaurants</span> in %');     taStore.store('typeahead.scoped.poi_flights_geo', '<span class="poi_overview_item">Flights</span> to %');     taStore.store('typeahead.scoped.poi_nbrhd_geo', '<span class="poi_overview_item">Neighbourhoods</span> in %');     taStore.store('typeahead.scoped.poi_travel_guides_geo', '<span class="poi_overview_item">Travel Guides</span> in %');     taStore.store('typeahead.scoped.overview', 'Overview ');     
taStore.store('typeahead.scoped.neighborhoods', 'Neighbourhoods');     taStore.store('typeahead.scoped.travel_guides', 'Travel Guides');     taStore.store('typeahead.scoped.geo_area_template', '% area');     taStore.store('typeahead.searchMore', 'Find more results for "%"');
taStore.store('typeahead.history', 'Recently viewed');     taStore.store('typeahead.history.all_caps', 'RECENTLY VIEWED');     taStore.store('typeahead.popular_destinations', 'POPULAR DESTINATIONS'); 
  });

我想獲取鍵 "coords" 第一次出現時對應的值。

import json
from bs4 import BeautifulSoup

# Sample page fragment used as input for the extraction below.
# Raw string literal: the embedded JSON uses `\/` escapes, which are not valid
# Python escape sequences. A raw literal yields exactly the same text while
# avoiding the invalid-escape warning a normal string literal triggers.
html = r"""
<script type="text/javascript">
require(['ta/Core/TA.Store'], function(taStore) {
      taStore.store('typeahead.typeahead2_mixed_ui', true);

      taStore.store('typeahead.typeahead2_geo_segmented_ui', true);

taStore.store('typeahead.geoArea', 'Singapore area');     taStore.store('typeahead.worldwide', 'Worldwide');     taStore.store('typeahead.noResultsFound', 'No results found.'); 
      taStore.store('typeahead.flight_enabled', true);

          taStore.store('typeahead.localAirports', []);

          taStore.store('typeahead.recentHistoryList', [{"war_url":"\/UserReview-g294265-d301581-Fairmont_Singapore-Singapore.html","autobroadened":"false","normalized_name":"fairmont singapore","type":"HOTEL","title":"Hotels","is_vr":false,"url":"\/Hotel_Review-g294265-d301581-Reviews-Fairmont_Singapore-Singapore.html","urls":[{"url_type":"hotel","name":"Fairmont Singapore, Singapore, Singapore","type":"HOTEL","url":"\/Hotel_Review-g294265-d301581-Reviews-Fairmont_Singapore-Singapore.html"}],"is_broad":false,"scope":"global","name":"Fairmont Singapore, Singapore, Singapore","data_type":"LOCATION","details":{"parent_name":"Singapore","grandparent_name":"Singapore","highlighted_name":"Fairmont Singapore","name":"Fairmont Singapore","parent_ids":[294265,294262,2,1],"geo_name":"Singapore, Singapore"},"value":301581,"coords":"1.293826,103.85387"},{"lookbackServlet":null,"autobroadened":"false","normalized_name":"singapore","title":"Destinations","type":"GEO","is_vr":true,"url":"\/Tourism-g294265-Singapore-Vacations.html","urls":[{"url_type":"geo","name":"Singapore Tourism","fallback_url":"\/Tourism-g294265-Singapore-Vacations.html","type":"GEO","url":"\/Tourism-g294265-Singapore-Vacations.html"},{"url_type":"vr","name":"Singapore Holiday Homes","fallback_url":"\/VacationRentals-g294265-Reviews-Singapore-Vacation_Rentals.html","type":"VACATION_RENTAL","url":"\/VacationRentals-g294265-Reviews-Singapore-Vacation_Rentals.html"},{"url_type":"eat","name":"Singapore Restaurants","fallback_url":"\/Restaurants-g294265-Singapore.html","type":"EATERY","url":"\/Restaurants-g294265-Singapore.html"},{"url_type":"attr","name":"Singapore Attractions","fallback_url":"\/Attractions-g294265-Activities-Singapore.html","type":"ATTRACTION","url":"\/Attractions-g294265-Activities-Singapore.html"},{"url_type":"hotel","name":"Singapore 
Hotels","fallback_url":"\/Hotels-g294265-Singapore-Hotels.html","type":"HOTEL","url":"\/Hotels-g294265-Singapore-Hotels.html"},{"url_type":"flights_to","name":"Flights to Singapore","fallback_url":"\/Flights-g294265-Singapore-Cheap_Discount_Airfares.html","type":"FLIGHTS_TO","url":"\/Flights-g294265-Singapore-Cheap_Discount_Airfares.html"},{"url_type":"nbrhd","name":"Singapore Neighbourhoods","fallback_url":"\/NeighborhoodList-g294265-Singapore.html","type":"NEIGHBORHOOD","url":"\/NeighborhoodList-g294265-Singapore.html"},{"url_type":"tg","name":"Singapore Travel Guides","fallback_url":"\/Travel_Guide-g294265-Singapore.html","type":"TRAVEL_GUIDE","url":"\/Travel_Guide-g294265-Singapore.html"}],"is_broad":false,"scope":"global","name":"Singapore, Singapore, Asia","data_type":"LOCATION","details":{"parent_name":"Singapore","grandparent_name":"Asia","rac_enabled":false,"highlighted_name":"Singapore","name":"Singapore","parent_ids":[294262,2,1],"geo_name":"Singapore, Asia"},"value":294265,"coords":"1.285801,103.85111"}]);

    taStore.store('typeahead.restaurant', "Restaurant");         taStore.store('typeahead.attraction', "Attraction");         taStore.store('typeahead.hotel', "Hotel");                       taStore.store('typeahead.restaurant_list', "Restaurants");       taStore.store('typeahead.attraction_list', "Attractions");       taStore.store('typeahead.things_to_do', "Places to Visit");                 taStore.store('typeahead.hotel_list', "Hotels");                 taStore.store('typeahead.flight_list', "Flights");                   taStore.store('typeahead.vacation_rental_list', "Holiday Rentals");     taStore.store('typeahead.scoped.static_local_label', '% area');     taStore.store('typeahead.scoped.result_title_text', 'Start typing, or try one of these suggestions...');     taStore.store('typeahead.scoped.poi_overview_geo', '<span class="poi_overview_item">Overview</span> of %');     taStore.store('typeahead.scoped.poi_hotels_geo', '<span class="poi_overview_item">Hotels</span> in %');     taStore.store('typeahead.scoped.poi_hotels_geo_near', '<span class="poi_overview_item">Hotels</span> near %');     taStore.store('typeahead.scoped.poi_vr_geo', '<span class="poi_overview_item">Holiday Rentals</span> in %');     taStore.store('typeahead.scoped.poi_vr_geo_near', '<span class="poi_overview_item">Holiday Rentals</span> near %');     taStore.store('typeahead.scoped.poi_attractions_geo', '<span class="poi_overview_item">Things to Do</span> in %');     taStore.store('typeahead.scoped.poi_eat_geo', '<span class="poi_overview_item">Restaurants</span> in %');     taStore.store('typeahead.scoped.poi_flights_geo', '<span class="poi_overview_item">Flights</span> to %');     taStore.store('typeahead.scoped.poi_nbrhd_geo', '<span class="poi_overview_item">Neighbourhoods</span> in %');     taStore.store('typeahead.scoped.poi_travel_guides_geo', '<span class="poi_overview_item">Travel Guides</span> in %');     taStore.store('typeahead.scoped.overview', 'Overview ');     
taStore.store('typeahead.scoped.neighborhoods', 'Neighbourhoods');     taStore.store('typeahead.scoped.travel_guides', 'Travel Guides');     taStore.store('typeahead.scoped.geo_area_template', '% area');     taStore.store('typeahead.searchMore', 'Find more results for "%"');
taStore.store('typeahead.history', 'Recently viewed');     taStore.store('typeahead.history.all_caps', 'RECENTLY VIEWED');     taStore.store('typeahead.popular_destinations', 'POPULAR DESTINATIONS'); 
  });
</script>
"""

soup = BeautifulSoup(html, "html.parser")
# The JSON array we want is the second argument of this taStore.store(...) call.
anchor = "taStore.store('typeahead.recentHistoryList', "
# Find the <script> element whose text contains that call.
s = soup.find(lambda tag: tag.name == "script" and anchor in tag.text)
if s is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError("no <script> element contains %r" % anchor)
script_text = s.text
# The JSON starts right after the anchor; derive the offset from the anchor
# itself rather than hard-coding its length.
json_start = script_text.index(anchor) + len(anchor)
# raw_decode parses exactly one JSON value starting at json_start and reports
# where it ended, so the trailing ");" and the rest of the script are ignored
# without searching for a terminator string.
# strict=False accepts raw control characters inside JSON strings (the sample
# has a literal line break inside one "name" value), which strict parsing
# rejects.
d, json_end = json.JSONDecoder(strict=False).raw_decode(script_text, json_start)
# Raw JSON text, kept under its original name.
j = script_text[json_start:json_end]
# Read the data from the parsed JSON: "coords" of the first history entry.
print(d[0]['coords'])

輸出:

1.293826,103.85387

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM