简体   繁体   中英

How to break a loop without breaking the loop containing it?

I tried to write a code that defines to classes: Webpage, and crwaler. the object: to build a generic search engine. I got on my pc a folder with a couple of "web_pages" in the form of html files. .im sorry for all the code, but i dont know how to make this question specific without it.

import re
import os

def remove_html_tags(s):
    tag = False
    quote = False
    out = ""

    for c in s:
            if c == '<' and not quote:
                tag = True
            elif c == '>' and not quote:
                tag = False
            elif (c == '"' or c == "'") and tag:
                quote = not quote
            elif not tag:
                out = out + c

    return out


def lev(s1, s2):
    return lev_iter(s1, s2, dict())

def lev_iter(s1, s2, mem):

    (i,j) = (len(s1), len(s2))
    if (i,j) in mem:
        return mem[(i,j)]

    s1_low = s1.lower()
    s2_low = s2.lower()
    if len(s1_low) == 0 or len(s2_low) == 0:
        return max(len(s1_low), len(s2_low))
    d1 = lev_iter(s1_low[:-1], s2_low, mem) + 1
    d2 = lev_iter(s1_low, s2_low[:-1], mem) + 1
    last = 0 if s1_low[-1] == s2_low[-1] else 1
    d3 = lev_iter(s1_low[:-1], s2_low[:-1], mem) + last
    result = min(d1, d2, d3)

    mem[(i,j)] = result

    return result




""" A Class that holds data on a Web page """
class WebPage:

    def __init__(self, filename):

        self.filename = filename

    def process(self):

        f = open(self.filename,'r')
        LINE_lst = f.readlines()

        self.info = {}

        for i in range(len(LINE_lst)):
            LINE_lst[i] = LINE_lst[i].strip(' \n\t')
            LINE_lst[i] = remove_html_tags(LINE_lst[i])
        lines = LINE_lst[:]
        for line in lines:
            if len(line) == 0:
                LINE_lst.remove(line)
        self.body = ' '.join(LINE_lst[1:])
        self.title = LINE_lst[0]
        f.close()

    def __str__(self):
        return self.title + '\n' + self.body

    def __repr__(self):
        return self.title

    def __eq__(self,other):
        n = lev(self.body,other.body)
        k = len(self.body)
        m = len(other.body)
        return float(n)/max(k,m) <= 0.15

    def __lt__(self,other):
        return self.title < other.title

""" A Class that crawls the web """     
class Crawler:
    def __init__(self, directory):

        self.folder = directory

    def crawl(self):

        pages = [f for f in os.listdir(self.folder) if f.endswith('.html')]

        final_list = []

        for page in pages:

            page = WebPage(self.folder + '\\' + page)
            page.process()

            for k in range(len(final_list)+1):

                if k == len(final_list):

                    final_list.append(page)

                elif page == final_list[k]:
                    if page < final_list[k]:
                        final_list.remove(final_list[k])
                        final_list.append(page)

        self.crawl = final_list

well.. everything works besides the crawl method. maybe im doing it all wrong i dont know. i want to break the loop whenever k is equal to to the lenth of final_list, but without breaking the loop containing it. any sugestions?

That's what the break keyword does . It only breaks the innermost loop.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM