1 files changed, 490 insertions, 0 deletions
diff --git a/parse_received.py b/parse_received.py
new file mode 100644
index 0000000..f9d106b
--- /dev/null
+++ b/parse_received.py
@@ -0,0 +1,490 @@
+# -*- coding: utf-8 -*-
+import dateutil.parser
+import email
+import email.utils
+from pprint import pformat
+import logging
+logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s',
+                    level=logging.DEBUG)  # module level: runs on import
+
+import rply
+
+
+#logging.basicConfig(level=logging.DEBUG)
+
+# from http://search.cpan.org/~simon/Email-Received-1.00/lib/Email/Received.pm
+# possible keys are
+# ip rdns helo ident envfrom auth by id
+
+# from RFC2821
+"""
+4.4 Trace Information
+
+
+   When an SMTP server receives a message for delivery or further
+   processing, it MUST insert trace ("time stamp" or "Received")
+   information at the beginning of the message content, as discussed in
+   section 4.1.1.4.
+
+   This line MUST be structured as follows:
+
+   -  The FROM field, which MUST be supplied in an SMTP environment,
+      SHOULD contain both (1) the name of the source host as presented
+      in the EHLO command and (2) an address literal containing the IP
+      address of the source, determined from the TCP connection.
+
+   -  The ID field MAY contain an "@" as suggested in RFC 822, but this
+      is not required.
+
+   -  The FOR field MAY contain a list of <path> entries when multiple
+      RCPT commands have been given.  This may raise some security
+      issues and is usually not desirable; see section 7.2.
+
+   An Internet mail program MUST NOT change a Received: line that was
+   previously added to the message header.  SMTP servers MUST prepend
+   Received lines to messages; they MUST NOT change the order of
+   existing lines or insert Received lines in any other location.
+
+   As the Internet grows, comparability of Received fields is important
+   for detecting problems, especially slow relays.  SMTP servers that
+   create Received fields SHOULD use explicit offsets in the dates
+   (e.g., -0800), rather than time zone names of any type.  Local time
+   (with an offset) is preferred to UT when feasible.  This formulation
+   allows slightly more information about local circumstances to be
+   specified.  If UT is needed, the receiver need merely do some simple
+   arithmetic to convert the values.  Use of UT loses information about
+   the time zone-location of the server.  If it is desired to supply a
+   time zone name, it SHOULD be included in a comment.
+
+   When the delivery SMTP server makes the "final delivery" of a
+   message, it inserts a return-path line at the beginning of the mail
+   data.  This use of return-path is required; mail systems MUST support
+   it.  The return-path line preserves the information in the <reverse-
+   path> from the MAIL command.  Here, final delivery means the message
+   has left the SMTP environment.  Normally, this would mean it had been
+   delivered to the destination user or an associated mail drop, but in
+   some cases it may be further processed and transmitted by another
+   mail system.
+
+   It is possible for the mailbox in the return path to be different
+   from the actual sender's mailbox, for example, if error responses are
+   to be delivered to a special error handling mailbox rather than to
+   the message sender.  When mailing lists are involved, this
+   arrangement is common and useful as a means of directing errors to
+   the list maintainer rather than the message originator.
+
+   The text above implies that the final mail data will begin with a
+   return path line, followed by one or more time stamp lines.  These
+   lines will be followed by the mail data headers and body [32].
+
+   It is sometimes difficult for an SMTP server to determine whether or
+   not it is making final delivery since forwarding or other operations
+   may occur after the message is accepted for delivery.  Consequently,
+   any further (forwarding, gateway, or relay) systems MAY remove the
+   return path and rebuild the MAIL command as needed to ensure that
+   exactly one such line appears in a delivered message.
+
+   A message-originating SMTP system SHOULD NOT send a message that
+   already contains a Return-path header.  SMTP servers performing a
+   relay function MUST NOT inspect the message data, and especially not
+   to the extent needed to determine if Return-path headers are present.
+   SMTP servers making final delivery MAY remove Return-path headers
+   before adding their own.
+
+   The primary purpose of the Return-path is to designate the address to
+   which messages indicating non-delivery or other mail system failures
+   are to be sent.  For this to be unambiguous, exactly one return path
+   SHOULD be present when the message is delivered.  Systems using RFC
+   822 syntax with non-SMTP transports SHOULD designate an unambiguous
+   address, associated with the transport envelope, to which error
+   reports (e.g., non-delivery messages) should be sent.
+
+   Historical note: Text in RFC 822 that appears to contradict the use
+   of the Return-path header (or the envelope reverse path address from
+   the MAIL command) as the destination for error messages is not
+   applicable on the Internet.  The reverse path address (as copied into
+   the Return-path) MUST be used as the target of any mail containing
+   delivery error messages.
+
+   In particular:
+
+   -  a gateway from SMTP->elsewhere SHOULD insert a return-path header,
+      unless it is known that the "elsewhere" transport also uses
+      Internet domain addresses and maintains the envelope sender
+      address separately.
+
+   -  a gateway from elsewhere->SMTP SHOULD delete any return-path
+      header present in the message, and either copy that information to
+      the SMTP envelope or combine it with information present in the
+      envelope of the other transport system to construct the reverse
+      path argument to the MAIL command in the SMTP envelope.
+
+   The server must give special treatment to cases in which the
+   processing following the end of mail data indication is only
+   partially successful.  This could happen if, after accepting several
+   recipients and the mail data, the SMTP server finds that the mail
+   data could be successfully delivered to some, but not all, of the
+   recipients.  In such cases, the response to the DATA command MUST be
+   an OK reply.  However, the SMTP server MUST compose and send an
+   "undeliverable mail" notification message to the originator of the
+   message.
+
+   A single notification listing all of the failed recipients or
+   separate notification messages MUST be sent for each failed
+   recipient.  For economy of processing by the sender, the former is
+   preferred when possible.  All undeliverable mail notification
+   messages are sent using the MAIL command (even if they result from
+   processing the obsolete SEND, SOML, or SAML commands) and use a null
+   return path as discussed in section 3.7.
+
+   The time stamp line and the return path line are formally defined as
+   follows:
+"""
+
+# from RFC 5536 (Netnews Article Format)
+
+"""
+3.1.5. Path
+
+
+   The Path header field indicates the route taken by an article since
+   its injection into the Netnews system.  Each agent that processes an
+   article is required to prepend at least one <path-identity> to this
+   header field body.  This is primarily so that news servers are able
+   to avoid sending articles to sites already known to have them, in
+   particular the site they came from.  Additionally, it permits
+   gathering statistics and tracing the route articles take in moving
+   over the network.
+
+   path            =  "Path:" SP *WSP path-list tail-entry *WSP CRLF
+
+   path-list       =  *( path-identity [FWS] [path-diagnostic] "!" )
+
+   path-diagnostic =  diag-match / diag-other / diag-deprecated
+
+   diag-match      =  "!"          ; another "!"
+
+   diag-other      =  "!." diag-keyword [ "." diag-identity ] [FWS]
+
+   diag-deprecated =  "!" IPv4address [FWS]
+
+   diag-keyword    =  1*ALPHA      ; see [RFC5537]
+
+   diag-identity   =  path-identity / IPv4address / IPv6address
+
+   tail-entry      =  path-nodot
+                      ; may be the string "not-for-mail"
+
+   path-identity   =  ( 1*( label "." ) toplabel ) / path-nodot
+
+   path-nodot      =  1*( alphanum / "-" / "_" )   ; legacy names
+
+   label           =  alphanum [ *( alphanum / "-" ) alphanum ]
+
+   toplabel        =  ( [ label *( "-" ) ] ALPHA *( "-" ) label ) /
+                      ( label *( "-" ) ALPHA [ *( "-" ) label ] ) /
+                      ( label 1*( "-" ) label )
+
+   alphanum        =  ALPHA / DIGIT        ; compare [RFC3696]
+
+   A <path-identity> is a name identifying a site.  It takes the form of
+   a domain name having two or more components separated by dots, or a
+   single name with no dots (<path-nodot>).
+
+   Each <path-identity> in the <path-list> (which does not include the
+   <tail-entry>) indicates, from right to left, the successive agents
+   through which the article has passed.  The use of the <diag-match>,
+   which appears as "!!", indicates that the agent to its left verified
+   the identity of the agent to its right before accepting the article
+   (whereas the <path-delimiter> "!" implies no such claim).
+
+      NOTE: Historically, the <tail-entry> indicated the name of the
+      sender.  If not used for this purpose, the string "not-for-mail"
+      is often used instead (since at one time the whole path could be
+      used as a mail address for the sender).
+
+      NOTE: Although case-insensitive, it is intended that the
+      <diag-keyword>s should be in uppercase, to distinguish them from
+      the <path-identity>s, which are traditionally in lowercase.
+
+   A <path-diagnostic> is an item inserted into the Path header field
+   for purposes other than to indicate the name of a site.  The use of
+   these is described in [RFC5537].
+
+      NOTE: One usage of a <path-diagnostic> is to record an IP address.
+      The fact that <IPv6address>es are allowed means that the colon (:)
+      is permitted; note that this may cause interoperability problems
+      at older sites that regard ":" as a <path-delimiter> and have
+      neighbors whose names have 4 or fewer characters, and where all
+      the characters are valid HEX digits.
+
+      NOTE: Although <IPv4address>es have occasionally been used in the
+      past (usually with a diagnostic intent), their continued use is
+      deprecated (though it is still acceptable in the form of the
+      <diag-deprecated>).
+"""
+
+
+__lg = rply.LexerGenerator()
+# add(name, regex) defines a token rule; ignore(regex) defines text to skip.
+__lg.ignore(r"\s+")  # whitespace between tokens
+__lg.ignore(r'[\n;=\)\(]+')  # runs of newline, ';', '=', ')' and '('
+__lg.add('FROMSEP', r"(?i)\bfrom\b")
+__lg.add('BYSEP', r"(?i)\bby\b")
+__lg.add('VIASEP', r"(?i)\bvia\b")
+__lg.add('WITHSEP', r"(?i)\bwith\b")
+__lg.add('FORSEP', r"(?i)\bfor\b")
+__lg.add('IDSEP', r"(?i)\bid\b")
+__lg.add('TCPSEP', r"(?i)\btcp\b")
+__lg.add('SMTPSEP', r"(?i)\b[e]?smtp\b")
+__lg.add('IPV4ADDRESS', r"\[?\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\]?")
+__lg.add('SMTPVERSION', r"\d{1,2}\.\d{1,2}\.\d{1,2}/\d{1,2}\.\d{1,2}\.\d{1,2}")
+__lg.add('DOMAIN', r"(?i)[<(]?(([\w][\w\-\.]*)\.)?([\w][\w\-]+)*" +
+         r"(\.([a-z][a-z]*))[>)]?")
+__weekday = r"(Mon|Tue|Wed|Thu|Fri|Sat|Sun)"
+__month_name = r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)"
+__lg.add('DATETIME', (r"%s\s*,\s+\d{1,2}\s+%s\s+\d{4} " +
+                      r"\d{1,2}:\d{2}:\d{2}\s+[-+]\d{4}(\s*\([A-Z]{3}\))?") %
+                     (__weekday, __month_name))
+__lg.add("EMAILADDR", r'<[a-zA-Z0-9_.+-]+@[a-zA-Z0-9._-]+>')
+__lg.add('IGNORABLESTR', r"may be forged")
+# Disabled rule: __lg.add('ParentStr', r"\([^)]+\)")
+__lg.add('STRING', r"\S+")  # any run of non-whitespace characters
+
+lexer = __lg.build()
+
+__pg = rply.ParserGenerator([rule.name for rule in lexer.rules],
+                            cache_id='received_parser')  # tokens = rule names
+
+"""
+Result of:
+
+from server.mymailhost.com
+    (mail.mymailhost.com [126.43.75.123])
+    by pilot01.cl.msu.edu (8.10.2/8.10.2) with ESMTP id NAA23597;
+    Fri, 12 Jul 2002 16:11:20 -0400 (EDT)
+
+is
+
+[[Token('FROMSEP', 'from'),
+  [Token('DOMAIN', 'server.mymailhost.com'),
+   [[Token('DOMAIN', 'mail.mymailhost.com'),
+     Token('IPV4ADDRESS', '[126.43.75.123]')]]]],
+ [Token('BYSEP', 'by'),
+  [Token('DOMAIN', 'pilot01.cl.msu.edu'),
+   [[Token('STRING', '8.10.2/8.10.2)')]]]],
+ [[[], [[Token('SMTPSEP', 'ESMTP')]], [Token('STRING', 'NAA23597;')], []]],
+ Token('DATETIME', 'Fri, 12 Jul 2002 16:11:20 -0400 (EDT)')]
+"""
+
+
+@__pg.production('main : stamp')
+def main(p):  # p is [stamp]; the stamp value is passed through unchanged
+    return p[0]
+
+
+@__pg.production('stamp : from-domain by-domain optinfo DATETIME')
+def stamp(p):
+    """Collect the four stamp parts: from, by, optinfo and DATETIME."""
+    return list(p[:4])
+
+
+@__pg.production('from-domain : FROMSEP extended-domain')
+def from_domain(p):
+    keyword, host = p  # FROMSEP token, then the extended-domain value
+    return [keyword, host]
+
+
+@__pg.production('by-domain : BYSEP extended-domain')
+def by_domain(p):
+    keyword, host = p  # BYSEP token, then the extended-domain value
+    return [keyword, host]
+
+
+@__pg.production('extended-domain : DOMAIN')
+def extended_domain(p):
+    (domain,) = p  # a lone DOMAIN token is passed through as the value
+    return domain
+
+
+@__pg.production('extended-domain : DOMAIN tcp-info')
+def extended_domain_tcp(p):
+    host, details = p  # DOMAIN token, then the tcp-info list
+    return [host, details]
+
+
+@__pg.production('extended-domain : IPV4ADDRESS tcp-info')
+def extended_domain_addr(p):
+    address, details = p  # IPV4ADDRESS token, then the tcp-info list
+    return [address, details]
+
+
+@__pg.production('extended-domain : extended-domain IGNORABLESTR')
+def extended_domain_ignorable(p):
+    domain, _note = p  # the trailing IGNORABLESTR token is dropped
+    return domain
+
+
+@__pg.production('tcp-info : IPV4ADDRESS')
+def tcp_info(p):
+    """Return the matched symbols unchanged: the list [IPV4ADDRESS]."""
+    return p
+
+
+@__pg.production('tcp-info : DOMAIN IPV4ADDRESS')
+def tcp_info_addr(p):
+    """Return the matched symbols unchanged: [DOMAIN, IPV4ADDRESS]."""
+    return p
+
+
+@__pg.production('tcp-info : SMTPVERSION')
+def tcp_info_string(p):
+    """Return the matched symbols unchanged: the list [SMTPVERSION]."""
+    return p
+
+
+@__pg.production('optinfo : via with id for')
+def optinfo(p):
+    """Wrap the four clause values [via, with, id, for] in an outer list."""
+    return [p]
+
+
+@__pg.production('via : ')
+def via_empty(p):
+    """Return the symbol list as-is when the via clause is absent."""
+    return p
+
+
+@__pg.production('via : VIASEP link')
+def via(p):
+    """Return [link]; p is [VIASEP, link], so the link value is p[1]."""
+    return [p[1]]
+
+
+@__pg.production('with :')
+def with_empty(p):
+    """Return the symbol list as-is when the with clause is absent."""
+    return p
+
+
+@__pg.production('with : WITHSEP protocol')
+def with_protocol(p):
+    _keyword, proto = p  # only the protocol value is kept
+    return proto
+
+
+@__pg.production('id : ')
+def id_empty(p):
+    """Return the symbol list as-is when the id clause is absent."""
+    return p
+
+
+@__pg.production('id : IDSEP STRING')
+def id(p):
+    _keyword, ident = p  # keep the STRING token, drop IDSEP
+    return [ident]
+
+
+# Actually EMAILADDR is Message-Id
+@__pg.production('id : IDSEP EMAILADDR')
+def id_addr(p):
+    _keyword, msgid = p  # keep the EMAILADDR token, drop IDSEP
+    return [msgid]
+
+
+@__pg.production('for : ')
+def for_empty(p):
+    """Return the symbol list as-is when the for clause is absent."""
+    return p
+
+
+# For = "FOR" FWS 1*( Path / Mailbox ) CFWS
+@__pg.production('for : FORSEP EMAILADDR')
+def optinfo_for(p):
+    """Return [EMAILADDR]: the recipient token p[1], not the FORSEP keyword."""
+    return [p[1]]
+
+
+@__pg.production('link : TCPSEP STRING')
+def link(p):
+    """Return [STRING]; p is [TCPSEP, STRING], so the value is p[1]."""
+    return [p[1]]
+
+
+@__pg.production('protocol : SMTPSEP')
+def protocol(p):
+    """Return the matched symbols unchanged: the list [SMTPSEP]."""
+    return p
+
+
+parser = __pg.build()  # built from the productions registered above
+
+
+# from http://stackoverflow.com/questions/27450365/
+def sort_key(received_header):
+    """Return the parsedate_tz() tuple of the date after the last ';'."""
+    return email.utils.parsedate_tz(received_header.rpartition(';')[2])
+
+# [[Token('FROMSEP', 'from'),
+#   [Token('DOMAIN', 'server.mymailhost.com'),
+#    [[Token('DOMAIN', 'mail.mymailhost.com'),
+#      Token('IPV4ADDRESS', '[126.43.75.123]')]]]],
+#  [Token('BYSEP', 'by'),
+#   [Token('DOMAIN', 'pilot01.cl.msu.edu'),
+#    Token('SMTPVERSION', '8.10.2/8.10.2')]],
+#  [[[], [Token('SMTPSEP', 'ESMTP')], [Token('STRING', 'NAA23597;')], []]],
+#  Token('DATETIME', 'Fri, 12 Jul 2002 16:11:20 -0400 (EDT)')]
+#
+# [[Token('FROMSEP', 'from'),
+#   [Token('DOMAIN', 'd1080.master.cz'),
+#    [[Token('DOMAIN', 'p-lab.cz'),
+#      Token('IPV4ADDRESS', '[89.185.245.149]')]]]],
+#  [Token('BYSEP', 'by'),
+#   [Token('DOMAIN', 'mx1.redhat.com'),
+#    Token('SMTPVERSION', '8.14.4/8.14.4')]],
+#  [[[],
+#    [Token('SMTPSEP', 'ESMTP')],
+#    [Token('STRING', 't07GaC1j031854')],
+#    [Token('FORSEP', 'for')]]],
+#  Token('DATETIME', 'Wed, 7 Jan 2015 11:36:13 -0500')]
+
+
+def parse_header(in_str):
+    """Parse one Received header value into a dict (from, by, date)."""
+    # Lex once into a list so the tokens can be logged and then parsed,
+    # instead of rewinding the lexer stream through its ``idx`` attribute.
+    tokens = list(lexer.lex(in_str))
+    logging.debug('\nstream:\n%s', tokens)
+    parsed = parser.parse(iter(tokens))
+    logging.debug('\nparsed:\n%s', pformat(parsed))
+
+    from_list = parsed[0][1]
+    logging.debug('\nfrom_list = %s', from_list[1])
+    by_list = parsed[1][1]
+    logging.debug('by_list = %s', by_list)
+    with_list = parsed[2]
+    logging.debug('with_list = %s', with_list)
+    date_str = parsed[3].getstr()
+    logging.debug('date_str = %s', date_str)
+
+    # 'halo' and 'reveresed' (sic) are kept so existing callers keep working.
+    return {
+        'from': {
+            'halo': from_list[0].getstr(),
+            'reveresed': from_list[1][0].getstr(),
+            'ipaddr': from_list[1][1].getstr()
+        },
+        'by': {'server': by_list[0].getstr()},
+        'date': dateutil.parser.parse(date_str)
+    }
+
+received_header_list = []  # placeholder: nothing is added to it here
+received_header_list.sort(key=sort_key)  # so this sort has nothing to order
+
+if __name__ == '__main__':
+    # Demo: echo a sample Received header, then print each token lexed from it.
+    sample = """from d1080.master.cz (p-lab.cz [89.185.245.149] (may be forged))
+        by mx1.redhat.com (8.14.4/8.14.4) with ESMTP id t07GaC1j031854
+        for <mcepl@redhat.com>; Wed, 7 Jan 2015 11:36:13 -0500"""
+    print(sample)
+    for tok in lexer.lex(sample):
+        print(tok)