
PageRank toy example does not converge

I'm coding a toy PageRank, including a crawler. Something looks off, since my code does not converge to the PR values. Note also that the delta between iterations is 0. Part of the output is:

url: http://en.m.wikipedia.org/wiki/Israel_State_Cup 
links_to_node: set(['http://en.m.wikipedia.org/wiki/Association_football', 'http://en.m.wikipedia.org/wiki/Wikipedia:General_disclaimer']) 
links_from_node: set(['http://en.m.wikipedia.org/wiki/Israel_State_Cup']) 
PR_score: 2.41759524248e+38 
ttl_time: 1 
last_delta: 0 

The code is as follows:

import requests 
import lxml.html 
import random 

class pr_node: 
     """WDM PR node""" 
     url = "" 
     links_to_node = set([]) 
     links_from_node = set([]) 
     PR_score = 0.0001 
     ttl_time = 0 
     last_delta = 0 

     def __init__(self, url, ttl_time): 
      self.url = url 
      self.links_to_node = set([]) 
      self.links_from_node = set([]) 
      self.PR_score = 0.1 
      self.ttl_time = ttl_time 

     def print_node_out_links(self): 
      print "\n\n" + self.url + " with ttl " + str(self.ttl_time) + " = " 
      s = self.links_to_node 
      print "{" + "\, ".join(str(e) for e in s) + "}" 

     def print_node_pr(self): 
      print "\n\n" + self.url + " PR is: " + str(self.PR_score) 

     def print_all(self): 
      print "url: " + self.url 
      print "links_to_node: " + repr(self.links_to_node) 
      print "links_from_node: " + repr(self.links_from_node) 
      print "PR_score: " + str(self.PR_score) 
      print "ttl_time: " + str(self.ttl_time) 
      print "last_delta: " + str(self.last_delta) 



def crawl(url, url_ttl): 
     """crawl to new url, if ttl == 0 max depth reached, don't visit same url twice""" 
     if url_ttl > 0 and (url not in visited_urls): 

      # create new node p from parsed page 
      print "crawling to " + url + "...\n" 
      res = requests.get(url) 
      doc = lxml.html.fromstring(res.content) 
      p = pr_node(url, url_ttl) 

      # add new PR node 
      global pr_nodes 
      pr_nodes[url] = p 

      # get all wiki links 
      all_links_to_node = set([]) 
      for t in doc.xpath("//a[contains(@href, '/wiki/')]"): 
       add_val = "" 
       if not t.attrib['href'].startswith("http://") and t.attrib['href'].startswith("/wiki/"): 
        add_val = "http://en.m.wikipedia.org" + t.attrib['href'] 
        all_links_to_node.add(add_val) 
       elif t.attrib['href'].startswith("http://"): 
        add_val = t.attrib['href'] 
        all_links_to_node.add(add_val) 
       else: 
        pass 

      # select random 10 of them and crawl to them 
      iter_count = 0 
      iter_stop_lim = 10 
      while iter_count < iter_stop_lim and len(p.links_to_node) < iter_stop_lim and len(all_links_to_node) > 0: 
        current_url = random.sample(all_links_to_node, 1)[0] 

        all_links_to_node.remove(current_url) # don't do it twice... 
        iter_count = + 1 
        if not (current_url in visited_urls) and url_ttl > 1: 
         p.links_to_node.add(current_url) 
         crawl(current_url, url_ttl - 1) 
         visited_urls.add(url) 
        elif current_url in visited_urls and url_ttl == 1: 
         p.links_to_node.add(current_url) 

     else: 
      print "max depth reached or you've already been here" 
     return 


def calc_graph_pr(pr_iter_count, damp_factor): 
    "print calculating PageRank" 
    current_iter = 0 
    global pr_nodes 

    g1 = {} 
    g2 = {} 
    for node in pr_nodes.itervalues(): 
     g1[node.url] = node 
     g2[node.url] = node 

    g = [g1, g2] 

    while current_iter < pr_iter_count: 
     print "PageRank iteration #" + str(current_iter) 
     for p in g[current_iter % 2].itervalues(): 
      in_links_addition = 0 
      for l in p.links_to_node: 
       l_val = g[(current_iter - 1) % 2][l] 
       l_val.delta = l_val.PR_score - g[current_iter % 2][l].PR_score 
       in_links_addition += l_val.PR_score/len(l_val.links_from_node) 
      p.PR_score = damp_factor + (1 - damp_factor) * in_links_addition 
     current_iter += 1 

    pr_nodes = g[0] #WLOG could be also g[1]... 

    for p in pr_nodes.itervalues(): 
     p.print_all() 

    print "check bool:" 
    print g1 == g2 
    return 


def update_graph_links(): 
    global pr_nodes 
    for node in pr_nodes.itervalues(): 
     for u in node.links_to_node: 
      if u in pr_nodes: 
       pr_nodes[u].links_from_node.add(u) 
    return 

visited_urls = set([]) 
pr_nodes = {} 

glob_pr_iter_count = 50 
glob_damp_factor = 0.2 

crawl("http://en.m.wikipedia.org/wiki/Nikola_Mitrovic", 3) 

update_graph_links() 
calc_graph_pr(glob_pr_iter_count, glob_damp_factor) 

So would it just keep going forever? I can see where the delta (the difference between the current PR score and the PR score from the previous iteration) is computed, but I don't see that value being used anywhere. Since the graph is said to have converged when every node's delta is below 0.0001, there should be some code that ends the iterations once that happens. – jksnw


@jksnw It isn't used yet, but it demonstrates that something is rotten in this piece of code. – GalB1t
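
A minimal sketch of the termination check jksnw describes, assuming each node's last_delta really is updated every iteration (note that the code above assigns l_val.delta, not l_val.last_delta) and using the 0.0001 threshold from the comment:

def pr_converged(nodes, eps=0.0001): 
     """True once every node's last PR delta is below eps""" 
     return all(abs(n.last_delta) < eps for n in nodes.itervalues()) 

# inside the PageRank loop, after all scores have been updated: 
# if pr_converged(pr_nodes): 
#  break 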

Answer


It was the edge-adding function that messed everything up: the original added u, the linked node's own URL, to its links_from_node set instead of node.url, so every node ended up registered as its own only in-neighbour. Fixed it:

def update_graph_links(): 
    """register each node with neighbours pointing at it""" 
    global pr_nodes 
    for node in pr_nodes.itervalues(): 
     for u in node.links_to_node: 
      if u in pr_nodes: 
       pr_nodes[u].links_from_node.add(node.url) 
    return 

After a few adjustments, some refactoring, and adding my own comments, it came out as the following code:

import requests 
import lxml.html 
import random 
import sys 

class pr_node: 
     """WDM PR node""" 

     url = "" 
     links_to_node = set([]) 
     links_from_node = set([]) 
     PR_score = 0.01 
     ttl_time = 0 
     last_delta = 0 # used for debug only 

     def __init__(self, url, ttl_time): 
      """CTOR""" 
      self.url = url 
      self.links_to_node = set([]) 
      self.links_from_node = set([]) 
      self.PR_score = 0.01 
      self.ttl_time = ttl_time 


     def print_node_out_links(self): 
      """print for q1a""" 
      print "\n\n" + self.url + " with ttl " + str(self.ttl_time) + " = " 
      s = self.links_to_node 
      print "{" + "\, ".join(str(e) for e in s) + "}" 


     def print_node_pr(self): 
      """print for q1b""" 
      print "\n\n" + self.url + " PR is: " + str(self.PR_score) 


     def print_all(self): 
      """print for q1b and debug""" 
      print "url: " + self.url 
      print "links_to_node: " + repr(self.links_to_node) 
      print "links_from_node: " + repr(self.links_from_node) 
      print "PR_score: " + str(self.PR_score) 
      print "ttl_time: " + str(self.ttl_time) 
      print "last_delta: " + str(self.last_delta) 


def crawl(url, url_ttl): 
     """crawl to new url, if ttl == 0 max depth reached, don't visit same url twice""" 
     if url_ttl > 0 and (url not in visited_urls): 

      # create new node p from parsed page 
      print "crawling to " + url + "...\n" 
      res = requests.get(url) 
      doc = lxml.html.fromstring(res.content) 
      p = pr_node(url, url_ttl) 

      # add new PR node 
      global pr_nodes 
      pr_nodes[url] = p 

      # get all wiki links, format to legit URL 
      all_links_to_node = set([]) 
      for t in doc.xpath("//a[contains(@href, '/wiki/')]"): 
       add_val = "" 
       if not t.attrib['href'].startswith("http://") and t.attrib['href'].startswith("/wiki/"): 
        add_val = "http://en.m.wikipedia.org" + t.attrib['href'] 
        all_links_to_node.add(add_val) 
       elif t.attrib['href'].startswith("http://"): 
        add_val = t.attrib['href'] 
        all_links_to_node.add(add_val) 
       else: 
        pass 

      # select random 10 of them and crawl to them 
      iter_count = 0 
      iter_stop_lim = 10 
      while iter_count < iter_stop_lim and len(p.links_to_node) < iter_stop_lim and len(all_links_to_node) > 0: 
        # sample random site of linked sites 
        current_url = random.sample(all_links_to_node, 1)[0] 
        # don't sample it twice... 
        all_links_to_node.remove(current_url) 
        iter_count += 1 

        # crawl if we haven't been there and TTL allows checking it 
        if not (current_url in visited_urls) and url_ttl > 1: 
         p.links_to_node.add(current_url) 
         crawl(current_url, url_ttl - 1) 
         visited_urls.add(url) 

        # if reached with TTL == 1 just check links to existing nodes 
        elif current_url in visited_urls and url_ttl == 1: 
         p.links_to_node.add(current_url) 

     else: 
      print "max depth reached or you've already been here" 
     return 


def calc_graph_pr(pr_nodes, pr_iter_count, damp_factor): 
    """calculate and print the graph's PageRank""" 
    current_iter = 0 

    # use two graph copies to prevent auto-interference 
    g1 = {} 
    g2 = {} 
    for node in pr_nodes.itervalues(): 
     g1[node.url] = node 
     g2[node.url] = node 

    g = [g1, g2] 
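     # note: g1 and g2 map to the same pr_node objects, not independent copies 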

    # do actual page rank here 
    while current_iter < pr_iter_count: 
     for p in g[current_iter % 2].itervalues(): 
      in_links_addition = 0 
      # iterate over all pointing nodes and sum their PR/out_link_count 
      for l in p.links_to_node: 
       l_val = g[(current_iter - 1) % 2][l] 
       l_val.last_delta = l_val.PR_score - g[current_iter % 2][l].PR_score 
       in_links_addition += l_val.PR_score/len(l_val.links_from_node) 
      # update w.r.t the computed sum and damp_factor 
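      # with damp_factor = 0.2 this is the classic (1 - d) + d * sum form with d = 0.8 
      # (the usual PageRank choice is d = 0.85) 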
      p.PR_score = damp_factor + (1 - damp_factor) * in_links_addition 
     current_iter += 1 

    # WLOG could be also g[1]... 
    pr_nodes = g[0] 

    for p in pr_nodes.itervalues(): 
     p.print_node_pr() 

    return 


def update_graph_links(): 
    """register each node with neighbours pointing at him""" 
    global pr_nodes 
    for node in pr_nodes.itervalues(): 
     for u in node.links_to_node: 
      if u in pr_nodes: 
       pr_nodes[u].links_from_node.add(node.url) 
    return 


if __name__ == '__main__': 

    urlToCrawl = "http://en.m.wikipedia.org/wiki/Nikola_Mitrovic" 

    # crawl to the requested site as default 
    if len(sys.argv) > 2: 
     sys.exit("Unexpected input") 
    elif len(sys.argv) == 1: 
     pass 
    else: 
     urlToCrawl = sys.argv[1] 

    print_q1a = False 
    print_q1b = True 

    # set global data structures for crawling and ranking 
    visited_urls = set([]) 
    pr_nodes = {} 

    # parameters for PageRank 
    glob_pr_iter_count = 100 
    glob_damp_factor = 0.2 

    # perform crawl in depth 3 
    crawl(urlToCrawl, 3) 

    if print_q1a: 
     for p in pr_nodes.itervalues(): 
      p.print_node_out_links() 

    elif print_q1b: 
     # first update the backlinks then start ranking 
     update_graph_links() 
     calc_graph_pr(pr_nodes, glob_pr_iter_count, glob_damp_factor) 
    else: 
     pass 
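
Given the sys.argv handling in __main__, the script runs with no arguments (crawling the default Nikola_Mitrovic page) or with a single start URL; the filename pagerank_toy.py below is just a placeholder:

python pagerank_toy.py 
python pagerank_toy.py "http://en.m.wikipedia.org/wiki/Israel_State_Cup" 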