aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Izidor Matušov <izidor.matusov@gmail.com> 2014-02-08 00:55:54 +0000
committer: Matěj Cepl <mcepl@redhat.com> 2014-02-13 02:13:39 +0100
commit: 691a5d554757b6ccb45baa70691f0a730c5f5a0c (patch)
tree: 8cbb01b303b7525e6d7e75eb713e13dfd95e9561
parent: a787996df1c6332e87e316885a27d6433e3767df (diff)
download: gg_scraper-691a5d554757b6ccb45baa70691f0a730c5f5a0c.tar.gz
Ignore links in welcome message
Some groups, e.g. django-oscar [1], have links in their welcome message. Those links are not meant to point to the next page, so ignore them.
[1]: https://groups.google.com/forum/#!forum/django-oscar
-rwxr-xr-x gg_scraper.py 4
1 files changed, 3 insertions, 1 deletions
diff --git a/gg_scraper.py b/gg_scraper.py
index cb6bc4e..640d304 100755
--- a/gg_scraper.py
+++ b/gg_scraper.py
@@ -220,9 +220,11 @@ class Group(Page):
BS = self._get_page_BS(target_stack.pop(0))
for a_elem in BS.find_all('a'):
is_topic, res = self.get_one_topic(a_elem)
+ # Ignore link in welcome message, e.g. django-oscar group
+ is_welcomemsg = a_elem.get('target') == 'welcomeMsg'
if is_topic:
out.append(res)
- else:
+ elif not is_welcomemsg:
other.append(res)
if len(other) == 1: