
How to parse multiple XMLs (RSS) from different websites for a single processing

I am trying to parse multiple XML feeds (RSS, not an API), one from each of several different websites, for a single analysis (multiple inputs, single result set). Each XML differs slightly in the xpath needed to extract the content.

I also want to filter out some words that should not appear in the results. At the moment, the word-frequency count works for a single online XML.

How can I make this simpler?

import urllib.request
with urllib.request.urlopen('http://python.org/') as response:
    html = response.read()

import MySQLdb
import math
import random
import requests
import collections
import string
import re
import xml.etree.ElementTree as ET
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
from string import punctuation
from collections import defaultdict
from collections import Counter    

def main(n=10):

        # Download the content

        #NYArtbeat
    #    contents1 = requests.get('http://www.nyartbeat.com/list/event_type_print_painting.en.xml')
    #    root=ET.fromstring(contents1.content)
    #    descs=[element.text for element in root.findall('.//description')]

        #FriezeMag
    #    contents1 = requests.get('http://feeds.feedburner.com/FriezeMagazineUniversal?format=xml')
    #    root=ET.fromstring(contents1.content)
    #    descs=[element.text for element in root.findall('.//description')]

        #Art Education
        contents = requests.get('http://www.artandeducation.net/category/announcement/feed/')
        root=ET.fromstring(contents.content)
        descs=[element.text for element in root.findall('.//description')]

        #Blouinartinfo
    #    contents1 = requests.get('http://www.blouinartinfo.com/rss/visual-arts.xml')
    #    root=ET.fromstring(contents1.content)
    #    descs=[element.text for element in root.findall('.//description')]

        #Art Agenda
    #    contents1 = requests.get('http://www.art-agenda.com/category/reviews/feed/')
    #    root=ET.fromstring(contents1.content)
    #    descs=[element.text for element in root.findall('.///.*')]




        # Clean the content a little

        filterWords = set(['artist', 'artists'])

        contents=",".join(map(str, descs))
        contents = re.sub(r'\s+', ' ', contents)
        contents = re.sub(r'[^A-Za-z ]+', '', contents)

        words=[w.lower() for w in contents.split() if len(w) >=6 ]


     #   fliteredWords=set(fliteredWords)-filterWords 


        # Start counting
        word_count = Counter(words)

        # The Top-N words
        print("The Top {0} words".format(n))
        for word, count in word_count.most_common(n):
            print("{0}: {1}".format(word, count))



if __name__ == "__main__":
    main()

You may want to create a list of the feeds and their xpaths, so that you can loop over them and process each one with a single function. Here is an example that does what you need. Note how easy it is to add any number of feeds and to specify an xpath for each. All of the samples you supplied use the xpath .//description, but the first one actually uses .//Description; you could just as easily handle a feed whose path is .//body, or anything else, by adding that path to the feeds list.

import requests, re
from xml.etree import ElementTree
from collections import Counter

def main(n=10):

    # A list of feeds to process and their xpath
    feeds = [
        {'url': 'http://www.nyartbeat.com/list/event_type_print_painting.en.xml', 'xpath': './/Description'},
        {'url': 'http://feeds.feedburner.com/FriezeMagazineUniversal?format=xml', 'xpath': './/description'},
        {'url': 'http://www.artandeducation.net/category/announcement/feed/', 'xpath': './/description'},
        {'url': 'http://www.blouinartinfo.com/rss/visual-arts.xml', 'xpath': './/description'},
        {'url': 'http://www.art-agenda.com/category/reviews/feed/', 'xpath': './/description'}
    ]

    # A place to hold all feed results
    results = []

    # Loop all the feeds
    for feed in feeds:
        # Append feed results together
        results = results + process(feed['url'], feed['xpath'])

    # Join all results into a big string
    contents = ",".join(results)

    # Collapse runs of whitespace into single spaces
    contents = re.sub(r'\s+', ' ', contents)

    # Remove everything that is not an ASCII letter or a space
    contents = re.sub(r'[^A-Za-z ]+', '', contents)

    # Create a list of lower case words that are at least 6 characters
    words = [w.lower() for w in contents.split() if len(w) >= 6]

    # Count the words
    word_count = Counter(words)

    # Clean the content a little
    filter_words = ['artist', 'artists']
    for word in filter_words:
        if word in word_count:
            del word_count[word]

    # And the survey says...
    print("The Top {0} words".format(n))
    for word, count in word_count.most_common(n):
        print("{0}: {1}".format(word, count))

def process(url, xpath):
    """
    Downloads a feed url and extracts the results with a variable path
    :param url: string
    :param xpath: string
    :return: list
    """
    contents = requests.get(url)
    root = ElementTree.fromstring(contents.content)
    return [element.text if element.text is not None else '' for element in root.findall(xpath)]

if __name__ == "__main__":
    main()
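One refinement worth considering when pulling from several third-party sites is per-feed error handling, so that one slow, unreachable, or malformed feed does not abort the whole run. Below is a minimal sketch of a drop-in replacement for process; the timeout value and the skip-on-failure policy are assumptions, not part of the original answer.

import requests
from xml.etree import ElementTree

def process(url, xpath, timeout=10):
    """
    Downloads a feed url and extracts the results with a variable path.
    Returns an empty list instead of raising if the feed cannot be
    fetched or parsed, so the remaining feeds are still processed.
    :param url: string
    :param xpath: string
    :param timeout: seconds to wait for the server (assumed value)
    :return: list
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # treat HTTP 4xx/5xx responses as failures
        root = ElementTree.fromstring(response.content)
    except (requests.RequestException, ElementTree.ParseError) as err:
        print("Skipping {0}: {1}".format(url, err))
        return []
    return [element.text if element.text is not None else '' for element in root.findall(xpath)]

With this version, the loop in main stays exactly as above; a dead feed simply contributes nothing to results instead of raising an exception.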

