about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Matěj Cepl <mcepl@redhat.com> 2013-12-28 00:27:09 +0100
committer Matěj Cepl <mcepl@redhat.com> 2013-12-28 00:51:43 +0100
commit 58ec4876d8dd638de14c2ccb3d959c40eadfe2d8 (patch)
tree 02e70224d63e15ef137b967a55d8f24abe56bee1
parent 9dcfa6e10d512cb767dac9c410c96072f7cbd166 (diff)
download gg_scraper-58ec4876d8dd638de14c2ccb3d959c40eadfe2d8.tar.gz
Convert to python3.
-rwxr-xr-x[-rw-r--r--] bs_test.py 6
-rwxr-xr-x gg_scrapper.py 63
-rw-r--r-- test-log.txt 15
-rw-r--r-- test_gg_scrapper.py 10
4 files changed, 50 insertions, 44 deletions
diff --git a/bs_test.py b/bs_test.py
index 5b6a840..58b7529 100644..100755
--- a/bs_test.py
+++ b/bs_test.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import sys
@@ -9,6 +9,6 @@ TOPIC_COUNT_RE = re.compile(r'\D+ \d+ - \d+ \D+ (\d+) \D+$')
bs = BeautifulSoup(open(sys.argv[1]))
i_str = bs.find_all('i')[0].string
-print("i = %s" % i_str)
+print("i = {}".format(i_str))
count = int(TOPIC_COUNT_RE.match(i_str).group(1))
-print("match i = %d" % count)
+print("match i = {0:d}".format(count))
diff --git a/gg_scrapper.py b/gg_scrapper.py
index 650810c..37f7a13 100755
--- a/gg_scrapper.py
+++ b/gg_scrapper.py
@@ -1,7 +1,9 @@
-#!/usr/bin/python
+#!/usr/bin/python3
import re
-import urllib2
+import urllib.request
+import urllib.error
+import urllib.parse
from bs4 import BeautifulSoup
import logging
logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s',
@@ -10,24 +12,15 @@ logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s',
TOPIC_COUNT_RE = re.compile(r'\D+ \d+ - \d+ \D+ (\d+) \D+$')
-class Topic(object):
- def __init__(self, URL, name):
- self.name = name
- self.root = URL # root of the discussion
-
- def __unicode__(self):
- return "%s: %s" % (self.root, self.name)
-
-
-class GooglePage(object):
- verb_handler = urllib2.HTTPHandler()
+class Page(object):
+ verb_handler = urllib.request.HTTPHandler()
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
verb_handler.set_http_debuglevel(2)
- redir_handler = urllib2.HTTPRedirectHandler()
- opener = urllib2.build_opener(verb_handler, redir_handler)
+ redir_handler = urllib.request.HTTPRedirectHandler()
+ opener = urllib.request.build_opener(verb_handler, redir_handler)
- def __init__(self, URL):
- self.bs_page = self.get_first_page_BS(URL)
+ def __init__(self):
+ pass
@staticmethod
def unenscape_Google_bang_URL(old_URL):
@@ -51,15 +44,40 @@ class GooglePage(object):
logging.debug('url = {}'.format(new_URL))
return cls.unenscape_Google_bang_URL(new_URL)
else:
- raise urllib2.HTTPError('Unknown URL: {}'.format(URL))
+ raise urllib.error.HTTPError('Unknown URL: {}'.format(URL))
- def get_first_page_BS(self, URL):
+ def _get_page_BS(self, URL):
res = self.opener.open(self.do_redirect(URL))
in_str = res.read()
bs = BeautifulSoup(in_str)
res.close()
return bs
+
+class Article(Page):
+ def __init__(self):
+ super(Article, self).__init__()
+
+
+class Topic(Page):
+ def __init__(self, URL, name):
+ super(Topic, self).__init__()
+ self.name = name
+ self.root = URL
+
+ def __unicode__(self):
+ return "%s: %s" % (self.root, self.name)
+
+ def get_articles(self):
+ page = self._get_page_BS(self.root)
+ page = page
+
+
+class Group(Page):
+ def __init__(self, URL):
+ super(Group, self).__init__()
+ self.group_URL = URL
+
def get_count_topics(self, BS):
'''Get total number of topics from the number on the page
itself.
@@ -74,13 +92,14 @@ class GooglePage(object):
i_str = i_elem[0].string
return int(TOPIC_COUNT_RE.match(i_str).group(1))
- def get_topics(self, BS):
+ def get_topics(self):
'''Recursively[?] get all topic (as special objects)
Also return (for error checking) number of topics from the head
of the topic page.
'''
out = []
other = []
+ BS = self._get_page_BS(self.group_URL)
for a_elem in BS.find_all('a'):
if 'title' in a_elem.attrs:
# filter out all-non-topic <a>s
@@ -92,8 +111,8 @@ class GooglePage(object):
other.append(a_elem)
if len(other) == 1:
- new_bs = BeautifulSoup(self.opener.open(other[0]['href']).read())
- out.extend(self.get_topics(new_bs))
+ new_bs = Group(other[0]['href'])
+ out.extend(new_bs.get_topics())
elif len(other) != 0:
raise ValueError(
'There must be either one or none link to the next page!')
diff --git a/test-log.txt b/test-log.txt
index 56a123b..980cfce 100644
--- a/test-log.txt
+++ b/test-log.txt
@@ -1128,19 +1128,6 @@ DEBUG:unenscape_Google_bang_URL:esc_URL = https://groups.google.com/forum/?_esca
ok
----------------------------------------------------------------------
-Ran 3 tests in 36.116s
+Ran 3 tests in 31.608s
OK
-send: u'GET /d/forum/jbrout HTTP/1.1\r\nAccept-Encoding: identity\r\nHost: groups.google.com\r\nConnection: close\r\nUser-Agent: Python-urllib/2.7\r\n\r\n'
-reply: 'HTTP/1.1 302 Moved Temporarily\r\n'
-header: Location: https://groups.google.com/d/forum/jbrout
-header: Content-Type: text/html; charset=UTF-8
-header: Date: Fri, 27 Dec 2013 03:31:46 GMT
-header: Expires: Fri, 27 Dec 2013 03:31:46 GMT
-header: Cache-Control: private, max-age=0
-header: X-Content-Type-Options: nosniff
-header: X-Frame-Options: SAMEORIGIN
-header: X-XSS-Protection: 1; mode=block
-header: Server: GSE
-header: Alternate-Protocol: 80:quic
-header: Connection: close
diff --git a/test_gg_scrapper.py b/test_gg_scrapper.py
index 1a75aab..10e7f4a 100644
--- a/test_gg_scrapper.py
+++ b/test_gg_scrapper.py
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
-from __future__ import unicode_literals, print_function, absolute_import
+
import logging
import unittest
import gg_scrapper
@@ -12,16 +12,16 @@ EXP_URL = 'https://groups.google.com/forum/' + \
class TestGGScrapper(unittest.TestCase):
def test_URL_conversion(self):
- obs_URL = gg_scrapper.GooglePage.unenscape_Google_bang_URL(IN_URL)
+ obs_URL = gg_scrapper.Group.unenscape_Google_bang_URL(IN_URL)
self.assertEqual(obs_URL, EXP_URL)
def test_do_redirect(self):
- obs_URL = gg_scrapper.GooglePage.do_redirect(ORIG_URL)
+ obs_URL = gg_scrapper.Group.do_redirect(ORIG_URL)
self.assertEqual(obs_URL, EXP_URL)
def test_collecting_topics(self):
- page = gg_scrapper.GooglePage(IN_URL)
- topics = page.get_topics(page.bs_page)
+ page = gg_scrapper.Group(IN_URL)
+ topics = page.get_topics()
logging.debug("number of topics = %d", len(topics))
self.assertGreater(len(topics), 0)