[英]How to parse data inside a string?
我有这样的数据:
'[ {
"agent_id": 558921,
"agent_name": "The City Townhouse",
"attributes": {
"agent_ratings_enabled": 0,
"approved": 1,
"attribute_set_id": 1,
"categories": JSON.parse("[15,19]"),
"indoor_features": ["Balcony","Maid\\\'s room"],
"is_agent": 1} ];'
我不知道如何解析它们。 它只是字符串的一部分。
我的代码在这里:
import requests
import re
# Download the listing page; `.content` is the raw, undecoded response body
# (bytes in the requests library).
res = requests.get('https://www.lamudi.com.ph/brandnew-modern-duplex-2-car-garage-house-and-lot-for-sale-in-betterliving-md.html').content
# Pattern: capture everything between "dataLayer = " and the closing </script>.
RE_OFFER_DATA_JSON_FROM_FLAT_PAGE = r".dataLayer = (.+?)<\/script>"
OFFER_DATA_JSON = re.compile(RE_OFFER_DATA_JSON_FROM_FLAT_PAGE)
# NOTE: str() applied to bytes yields the printable representation (b'...'),
# so line breaks appear as the two characters backslash + "n" and quotes gain
# extra backslashes. The literal "\n" sequences are stripped further down.
page_source = str(res)
data = OFFER_DATA_JSON.search(page_source).group(1)
# Drop every literal backslash-n sequence left over from the representation.
text = "".join(data.split("\\n"))
print(text)
我知道,使用正则表达式不是一个好主意,但我没有其他方法。
感谢您的任何建议
要从该站点加载 JSON 数据,需要先清除其中夹带的 JavaScript 代码片段:
import re
import json
import requests
def extract_data_layer(html):
    """Return the page's ``dataLayer`` array parsed into Python objects.

    The array is embedded in the page as a JavaScript literal, not strict
    JSON, so a few JavaScript-only constructs are rewritten before parsing.

    Args:
        html: Decoded page source as a ``str``.

    Returns:
        The decoded ``dataLayer`` value (a list).

    Raises:
        ValueError: If no ``dataLayer = [...];`` assignment is found.
        json.JSONDecodeError: If the cleaned text is still not valid JSON.
    """
    # re.S lets "." span line breaks: the literal covers many lines.
    match = re.search(r"dataLayer = (\[.*?\]);", html, flags=re.S)
    if match is None:
        raise ValueError("dataLayer assignment not found in page source")
    data = match.group(1)

    # clean the data from Javascript artifacts:
    # JSON.parse("[15,19]") -> [15,19]
    data = re.sub(r'JSON\.parse\("(.*?)"\)', r"\1", data)
    # trailing comma before a closing brace is not allowed in JSON
    data = re.sub(r",\s*}", r"}", data)
    # bare JavaScript expression -> empty JSON string
    data = re.sub(r"navigator\.userAgent", r'""', data)

    # parse the data
    return json.loads(data)


if __name__ == "__main__":
    res = requests.get(
        "https://www.lamudi.com.ph/brandnew-modern-duplex-2-car-garage-house-and-lot-for-sale-in-betterliving-md.html",
        timeout=30,  # never wait indefinitely on an unresponsive server
    )
    # Fail early on HTTP errors instead of parsing an error page.
    res.raise_for_status()

    data = extract_data_layer(res.text)

    # pretty print the data:
    print(json.dumps(data, indent=4))
输出:
[
{
"agent_id": 558921,
"agent_name": "The City Townhouse",
"attributes": {
"agent_ratings_enabled": 0,
"approved": 1,
"attribute_set_id": 1,
"categories": [
15,
19
],
"indoor_features": [
"Balcony",
"Maid's room"
],
"is_agent": 1,
"listing_type": "Classifieds",
"other_features": [],
"outdoor_features": [
"Garage"
],
"price_formatted": "\u20b1 11,300,000 ",
"price_not_shown": false,
"seller_is_trusted": 1,
"show_listing_address": 1,
"show_mobile": 1,
"is_whatsapp": 1,
"is_viber": 0,
"is_facebook": 0,
"show_officephone": 0,
"top_position": 0,
"urlkey_details": "brandnew-modern-duplex-2-car-garage-house-and-lot-for-sale-in-betterliving-md.html",
"bathrooms": 3,
"bedrooms": 3,
"building_size": 210,
"car_spaces": 2,
"land_size": 150,
"agency_name": "The City Townhouse",
"alternate_sku": "39131996",
"attribute_set_name": "House",
"currency_code": "PHP",
"listing_start": "2021-02-09 11:26:05",
"name": "Brandnew Modern Duplex 2 Car Garage House and Lot for Sale in Betterliving-MD",
"price": 11300000,
"product_owner": 558921,
"product_owner_name": "The City Townhouse",
"product_owner_url_key": "miyabi-realty-agn-8",
"root_category": 15,
"sku": "HO602200CD8819DPH",
"status": "active",
"status_supplier_config": "active",
"subcategory": "Townhouse",
"subcategory_id": 19,
"offer_type_id": 2,
"location_latitude": "14.482705",
"location_longitude": "121.024909",
"listing_region": "Metro Manila",
"offer_type": "Buy",
"listing_area": "Don Bosco",
"listing_city": "Para\u00f1aque",
"image_url": "https://static-ph.lamudi.com/static/media/bm9uZS9ub25l/2x2x5x880x396/ac7e9e4327f459.jpg"
},
"country": "ph",
"description": {
"characters": 438,
"text": "Duplex 2 Storey Townhouse3 Bedrooms3 Toilet &amp; BathMaids RoomLiving RoomDinning RoomKitchen AreaBalconyMaster Bedrooms w/ walk in closetSELLING PRICE:11.3MLot Area:150 sqmFloor Area:210sqmTerms Of Payment:400K Reservation20% Downpayment80% Bank Finance",
"words": 31
},
"device_agent": "",
"device_type": "desktop",
"images": 15,
"language": "en",
"location": {
"area": "Don Bosco",
"city": "Para\u00f1aque",
"region": "Metro Manila"
},
"login_state": false,
"number_of_product_attributes": 49,
"number_of_product_features": 3,
"sku": "HO602200CD8819DPH",
"title": "Brandnew Modern Duplex 2 Car Garage House and Lot for Sale in Betterliving-MD",
"page_type": "product"
}
]
如果它是有效的 JSON,您可以使用内置的 json 模块,特别是用于解析字符串的 json.loads() 函数。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.