gg_scrapper.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120

#!/usr/bin/python3

import re
import urllib.request
import urllib.error
import urllib.parse
from bs4 import BeautifulSoup
import logging
# Configure the root logger at import time: DEBUG level, each record
# prefixed with its level name and the name of the emitting function.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(levelname)s:%(funcName)s:%(message)s')

# Matches a string shaped like "<text> N - M <text> TOTAL <text>" and
# captures TOTAL (the third number) as group 1.
TOPIC_COUNT_RE = re.compile(
    r'\D+ \d+ - \d+ \D+ (\d+) \D+$')


class Page(object):
    """Base class for anything fetched over HTTP.

    Holds one URL opener shared by all instances and subclasses, and
    provides helpers to resolve redirects and to download a page as a
    BeautifulSoup tree.
    """
    # The handlers and the opener are built once, when the class body is
    # executed, and are shared through the class attribute ``opener``.
    verb_handler = urllib.request.HTTPHandler()
    if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
        # Raise the HTTP handler's debug level when the root logger is
        # at DEBUG at class-definition time.
        verb_handler.set_http_debuglevel(2)
    redir_handler = urllib.request.HTTPRedirectHandler()
    opener = urllib.request.build_opener(verb_handler, redir_handler)

    def __init__(self):
        pass

    @staticmethod
    def unenscape_Google_bang_URL(old_URL):
        """Rewrite a hash-bang URL into its ``_escaped_fragment_`` form.

        Every ``#!`` in ``old_URL`` is replaced by
        ``?_escaped_fragment_=``; a URL without ``#!`` is returned
        unchanged.

        See https://developers.google.com/webmasters\
                /ajax-crawling/docs/getting-started for more information
        """
        if '#!' in old_URL:
            esc_URL = old_URL.replace('#!', '?_escaped_fragment_=')
            logging.debug('esc_URL = {}'.format(esc_URL))
            return esc_URL
        return old_URL

    @classmethod
    def do_redirect(cls, URL):
        """Open ``URL`` and return the final URL in escaped-fragment form.

        The final URL is whatever the response reports via ``geturl()``
        after the opener has processed the request.

        Raises:
            urllib.error.HTTPError: if the response status is not 200.
        """
        res = cls.opener.open(URL)
        try:
            code = res.getcode()
            if code != 200:
                # HTTPError takes (url, code, msg, hdrs, fp); the
                # response itself is closed below, so no fp is attached.
                raise urllib.error.HTTPError(
                    URL, code, 'Unknown URL: {}'.format(URL),
                    res.headers, None)
            new_URL = res.geturl()
        finally:
            # Only the URL is needed here; release the connection.
            res.close()

        logging.debug('url = {}'.format(new_URL))
        return cls.unenscape_Google_bang_URL(new_URL)

    def _get_page_BS(self, URL):
        """Download ``URL`` (after ``do_redirect``) and parse it.

        Returns the BeautifulSoup object built from the response body.
        The response is closed even when reading or parsing fails.
        """
        res = self.opener.open(self.do_redirect(URL))
        try:
            return BeautifulSoup(res.read())
        finally:
            res.close()


class Article(Page):
    """Placeholder subclass of Page; adds no state or behaviour yet."""

    def __init__(self):
        super().__init__()


class Topic(Page):
    """A topic, identified by the URL of its page and its title."""

    def __init__(self, URL, name):
        """Store the topic's page URL as ``root`` and its title as ``name``."""
        super(Topic, self).__init__()
        self.name = name
        self.root = URL

    def __str__(self):
        """Return ``"<root>: <name>"``."""
        return "%s: %s" % (self.root, self.name)

    # Python 3 never calls __unicode__ implicitly; keep the old name as
    # an alias so any code calling it explicitly still works.
    __unicode__ = __str__

    def get_articles(self):
        """Fetch the topic page.

        Not implemented beyond the download: the parsed page is
        discarded and the method returns None.
        """
        # TODO: extract the articles from the fetched page.
        self._get_page_BS(self.root)


class Group(Page):
    """A group, identified by the URL of its first topic-list page."""

    def __init__(self, URL):
        """Remember ``URL`` as ``group_URL``; nothing is fetched yet."""
        super(Group, self).__init__()
        self.group_URL = URL

    def get_count_topics(self, BS):
        '''Get total number of topics from the number on the page
        itself.

        Which would be awesome for control, except it is wrong on all
        pages in various and different ways. :(

        ``BS`` is the parsed page; the text of its first <i> element is
        matched against TOPIC_COUNT_RE and the captured number returned
        as an int.

        Raises ValueError when there is no <i> element, or when its
        text is missing or does not match the expected pattern.
        '''
        i_elem = BS.find_all('i')
        if not i_elem:
            raise ValueError('Cannot find count of topics!')

        # .string is None when the element does not wrap exactly one
        # string, so guard before handing it to the regex.
        i_str = i_elem[0].string
        matched = TOPIC_COUNT_RE.match(i_str) if i_str else None
        if matched is None:
            raise ValueError(
                'Cannot parse count of topics from {!r}'.format(i_str))
        return int(matched.group(1))

    def get_topics(self):
        '''Get all topics of the group as a list of Topic objects.

        Starting at ``group_URL``, every <a> carrying a ``title``
        attribute becomes a Topic.  Any other <a> is taken to be the
        link to the next page, which is then processed the same way
        until a page has no such link.

        Raises ValueError when a page has more than one <a> without a
        title, or when the next-page links lead back to a page URL that
        was already visited.
        '''
        out = []
        seen = set()
        next_URL = self.group_URL

        # Walk the pages iteratively so the number of pages is not
        # bounded by the interpreter's recursion limit.
        while next_URL is not None:
            if next_URL in seen:
                raise ValueError(
                    'Next page link loops back to {}'.format(next_URL))
            seen.add(next_URL)

            other = []
            BS = self._get_page_BS(next_URL)
            for a_elem in BS.find_all('a'):
                if 'title' in a_elem.attrs:
                    # filter out all-non-topic <a>s
                    logging.debug('href = %s', a_elem['href'])
                    logging.debug('title = %s', a_elem['title'])
                    out.append(Topic(a_elem['href'], a_elem['title']))
                else:
                    logging.debug('other = %s', a_elem)
                    other.append(a_elem)

            if len(other) > 1:
                raise ValueError(
                    'There must be either one or none link to the next page!')
            next_URL = other[0]['href'] if other else None

        return out