aboutsummaryrefslogtreecommitdiffstats
path: root/parse_received.py
diff options
context:
space:
mode:
Diffstat (limited to 'parse_received.py')
-rw-r--r--parse_received.py490
1 files changed, 490 insertions, 0 deletions
diff --git a/parse_received.py b/parse_received.py
new file mode 100644
index 0000000..f9d106b
--- /dev/null
+++ b/parse_received.py
@@ -0,0 +1,490 @@
+# -*- coding: utf-8 -*-
+import dateutil.parser
+import email
+import email.utils
+from pprint import pformat
+import logging
+logging.basicConfig(format='%(levelname)s:%(funcName)s:%(message)s',
+ level=logging.DEBUG)
+
+import rply
+
+
+#logging.basicConfig(level=logging.DEBUG)
+
+# from http://search.cpan.org/~simon/Email-Received-1.00/lib/Email/Received.pm
+# possible keys are
+# ip rdns helo ident envfrom auth by id
+
+# from RFC2821
+"""
+4.4 Trace Information
+
+
+ When an SMTP server receives a message for delivery or further
+ processing, it MUST insert trace ("time stamp" or "Received")
+ information at the beginning of the message content, as discussed in
+ section 4.1.1.4.
+
+ This line MUST be structured as follows:
+
+ - The FROM field, which MUST be supplied in an SMTP environment,
+ SHOULD contain both (1) the name of the source host as presented
+ in the EHLO command and (2) an address literal containing the IP
+ address of the source, determined from the TCP connection.
+
+ - The ID field MAY contain an "@" as suggested in RFC 822, but this
+ is not required.
+
+ - The FOR field MAY contain a list of <path> entries when multiple
+ RCPT commands have been given. This may raise some security
+ issues and is usually not desirable; see section 7.2.
+
+ An Internet mail program MUST NOT change a Received: line that was
+ previously added to the message header. SMTP servers MUST prepend
+ Received lines to messages; they MUST NOT change the order of
+ existing lines or insert Received lines in any other location.
+
+ As the Internet grows, comparability of Received fields is important
+ for detecting problems, especially slow relays. SMTP servers that
+ create Received fields SHOULD use explicit offsets in the dates
+ (e.g., -0800), rather than time zone names of any type. Local time
+ (with an offset) is preferred to UT when feasible. This formulation
+ allows slightly more information about local circumstances to be
+ specified. If UT is needed, the receiver need merely do some simple
+ arithmetic to convert the values. Use of UT loses information about
+ the time zone-location of the server. If it is desired to supply a
+ time zone name, it SHOULD be included in a comment.
+
+ When the delivery SMTP server makes the "final delivery" of a
+ message, it inserts a return-path line at the beginning of the mail
+ data. This use of return-path is required; mail systems MUST support
+ it. The return-path line preserves the information in the <reverse-
+ path> from the MAIL command. Here, final delivery means the message
+ has left the SMTP environment. Normally, this would mean it had been
+ delivered to the destination user or an associated mail drop, but in
+ some cases it may be further processed and transmitted by another
+ mail system.
+
+ It is possible for the mailbox in the return path to be different
+ from the actual sender's mailbox, for example, if error responses are
+ to be delivered to a special error handling mailbox rather than to
+ the message sender. When mailing lists are involved, this
+ arrangement is common and useful as a means of directing errors to
+ the list maintainer rather than the message originator.
+
+ The text above implies that the final mail data will begin with a
+ return path line, followed by one or more time stamp lines. These
+ lines will be followed by the mail data headers and body [32].
+
+ It is sometimes difficult for an SMTP server to determine whether or
+ not it is making final delivery since forwarding or other operations
+ may occur after the message is accepted for delivery. Consequently,
+ any further (forwarding, gateway, or relay) systems MAY remove the
+ return path and rebuild the MAIL command as needed to ensure that
+ exactly one such line appears in a delivered message.
+
+ A message-originating SMTP system SHOULD NOT send a message that
+ already contains a Return-path header. SMTP servers performing a
+ relay function MUST NOT inspect the message data, and especially not
+ to the extent needed to determine if Return-path headers are present.
+ SMTP servers making final delivery MAY remove Return-path headers
+ before adding their own.
+
+ The primary purpose of the Return-path is to designate the address to
+ which messages indicating non-delivery or other mail system failures
+ are to be sent. For this to be unambiguous, exactly one return path
+ SHOULD be present when the message is delivered. Systems using RFC
+ 822 syntax with non-SMTP transports SHOULD designate an unambiguous
+ address, associated with the transport envelope, to which error
+ reports (e.g., non-delivery messages) should be sent.
+
+ Historical note: Text in RFC 822 that appears to contradict the use
+ of the Return-path header (or the envelope reverse path address from
+ the MAIL command) as the destination for error messages is not
+ applicable on the Internet. The reverse path address (as copied into
+ the Return-path) MUST be used as the target of any mail containing
+ delivery error messages.
+
+ In particular:
+
+ - a gateway from SMTP->elsewhere SHOULD insert a return-path header,
+ unless it is known that the "elsewhere" transport also uses
+ Internet domain addresses and maintains the envelope sender
+ address separately.
+
+ - a gateway from elsewhere->SMTP SHOULD delete any return-path
+ header present in the message, and either copy that information to
+ the SMTP envelope or combine it with information present in the
+ envelope of the other transport system to construct the reverse
+ path argument to the MAIL command in the SMTP envelope.
+
+ The server must give special treatment to cases in which the
+ processing following the end of mail data indication is only
+ partially successful. This could happen if, after accepting several
+ recipients and the mail data, the SMTP server finds that the mail
+ data could be successfully delivered to some, but not all, of the
+ recipients. In such cases, the response to the DATA command MUST be
+ an OK reply. However, the SMTP server MUST compose and send an
+ "undeliverable mail" notification message to the originator of the
+ message.
+
+ A single notification listing all of the failed recipients or
+ separate notification messages MUST be sent for each failed
+ recipient. For economy of processing by the sender, the former is
+ preferred when possible. All undeliverable mail notification
+ messages are sent using the MAIL command (even if they result from
+ processing the obsolete SEND, SOML, or SAML commands) and use a null
+ return path as discussed in section 3.7.
+
+ The time stamp line and the return path line are formally defined as
+ follows:
+"""
+
+# from RFC 5536 (Netnews Article Format), section 3.1.5
+
+"""
+3.1.5. Path
+
+
+ The Path header field indicates the route taken by an article since
+ its injection into the Netnews system. Each agent that processes an
+ article is required to prepend at least one <path-identity> to this
+ header field body. This is primarily so that news servers are able
+ to avoid sending articles to sites already known to have them, in
+ particular the site they came from. Additionally, it permits
+ gathering statistics and tracing the route articles take in moving
+ over the network.
+
+ path = "Path:" SP *WSP path-list tail-entry *WSP CRLF
+
+ path-list = *( path-identity [FWS] [path-diagnostic] "!" )
+
+ path-diagnostic = diag-match / diag-other / diag-deprecated
+
+ diag-match = "!" ; another "!"
+
+ diag-other = "!." diag-keyword [ "." diag-identity ] [FWS]
+
+ diag-deprecated = "!" IPv4address [FWS]
+
+ diag-keyword = 1*ALPHA ; see [RFC5537]
+
+ diag-identity = path-identity / IPv4address / IPv6address
+
+ tail-entry = path-nodot
+ ; may be the string "not-for-mail"
+
+ path-identity = ( 1*( label "." ) toplabel ) / path-nodot
+
+ path-nodot = 1*( alphanum / "-" / "_" ) ; legacy names
+
+ label = alphanum [ *( alphanum / "-" ) alphanum ]
+
+ toplabel = ( [ label *( "-" ) ] ALPHA *( "-" ) label ) /
+ ( label *( "-" ) ALPHA [ *( "-" ) label ] ) /
+ ( label 1*( "-" ) label )
+
+ alphanum = ALPHA / DIGIT ; compare [RFC3696]
+
+ A <path-identity> is a name identifying a site. It takes the form of
+ a domain name having two or more components separated by dots, or a
+ single name with no dots (<path-nodot>).
+
+ Each <path-identity> in the <path-list> (which does not include the
+ <tail-entry>) indicates, from right to left, the successive agents
+ through which the article has passed. The use of the <diag-match>,
+ which appears as "!!", indicates that the agent to its left verified
+ the identity of the agent to its right before accepting the article
+ (whereas the <path-delimiter> "!" implies no such claim).
+
+ NOTE: Historically, the <tail-entry> indicated the name of the
+ sender. If not used for this purpose, the string "not-for-mail"
+ is often used instead (since at one time the whole path could be
+ used as a mail address for the sender).
+
+ NOTE: Although case-insensitive, it is intended that the
+ <diag-keyword>s should be in uppercase, to distinguish them from
+ the <path-identity>s, which are traditionally in lowercase.
+
+ A <path-diagnostic> is an item inserted into the Path header field
+ for purposes other than to indicate the name of a site. The use of
+ these is described in [RFC5537].
+
+ NOTE: One usage of a <path-diagnostic> is to record an IP address.
+ The fact that <IPv6address>es are allowed means that the colon (:)
+ is permitted; note that this may cause interoperability problems
+ at older sites that regard ":" as a <path-delimiter> and have
+ neighbors whose names have 4 or fewer characters, and where all
+ the characters are valid HEX digits.
+
+ NOTE: Although <IPv4address>es have occasionally been used in the
+ past (usually with a diagnostic intent), their continued use is
+ deprecated (though it is still acceptable in the form of the
+ <diag-deprecated>).
+"""
+
+
+# Lexer for "Received:" trace headers.  rply tries the rules in the order
+# they were added and emits the first one that matches, so more specific
+# patterns have to be registered before more general ones.
+__lg = rply.LexerGenerator()
+# Whitespace and bare punctuation carry no meaning for the grammar below.
+__lg.ignore(r"\s+")
+__lg.ignore(r'[\n;=\)\(]+')
+# Clause keywords: case-insensitive, whole words only.
+__lg.add('FROMSEP', r"(?i)\bfrom\b")
+__lg.add('BYSEP', r"(?i)\bby\b")
+__lg.add('VIASEP', r"(?i)\bvia\b")
+__lg.add('WITHSEP', r"(?i)\bwith\b")
+__lg.add('FORSEP', r"(?i)\bfor\b")
+__lg.add('IDSEP', r"(?i)\bid\b")
+__lg.add('TCPSEP', r"(?i)\btcp\b")
+__lg.add('SMTPSEP', r"(?i)\b[e]?smtp\b")
+__lg.add('IPV4ADDRESS', r"\[?\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\]?")
+__lg.add('SMTPVERSION', r"\d{1,2}\.\d{1,2}\.\d{1,2}/\d{1,2}\.\d{1,2}\.\d{1,2}")
+# EMAILADDR has to come before DOMAIN: DOMAIN also accepts a leading "<" and
+# would otherwise match the "<first.last" prefix of an address whose local
+# part contains a dot, leaving "@host>" behind as a stray STRING.
+__lg.add("EMAILADDR", r'<[a-zA-Z0-9_.+-]+@[a-zA-Z0-9._-]+>')
+__lg.add('DOMAIN', r"(?i)[<(]?(([\w][\w\-\.]*)\.)?([\w][\w\-]+)*" +
+         r"(\.([a-z][a-z]*))[>)]?")
+__weekday = r"(Mon|Tue|Wed|Thu|Fri|Sat|Sun)"
+__month_name = r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)"
+__lg.add('DATETIME', (r"%s\s*,\s+\d{1,2}\s+%s\s+\d{4} " +
+                      r"\d{1,2}:\d{2}:\d{2}\s+[-+]\d{4}(\s*\([A-Z]{3}\))?") %
+         (__weekday, __month_name))
+__lg.add('IGNORABLESTR', r"may be forged")
+# __lg.add('ParentStr', r"\([^)]+\)")
+# Catch-all: any other run of non-blank characters.
+__lg.add('STRING', r"\S+")
+
+lexer = __lg.build()
+
+# The parser accepts exactly the token names the lexer can emit.
+__pg = rply.ParserGenerator([rule.name for rule in lexer.rules],
+                            cache_id='received_parser')
+
+"""
+Result of:
+
+from server.mymailhost.com
+ (mail.mymailhost.com [126.43.75.123])
+ by pilot01.cl.msu.edu (8.10.2/8.10.2) with ESMTP id NAA23597;
+ Fri, 12 Jul 2002 16:11:20 -0400 (EDT)
+
+is
+
+[[Token('FROMSEP', 'from'),
+ [Token('DOMAIN', 'server.mymailhost.com'),
+ [[Token('DOMAIN', 'mail.mymailhost.com'),
+ Token('IPV4ADDRESS', '[126.43.75.123]')]]]],
+ [Token('BYSEP', 'by'),
+ [Token('DOMAIN', 'pilot01.cl.msu.edu'),
+ [[Token('STRING', '8.10.2/8.10.2)')]]]],
+ [[[], [[Token('SMTPSEP', 'ESMTP')]], [Token('STRING', 'NAA23597;')], []]],
+ Token('DATETIME', 'Fri, 12 Jul 2002 16:11:20 -0400 (EDT)')]
+"""
+
+
+@__pg.production('main : stamp')
+def main(p):
+    """Start symbol: hand back the value of the single ``stamp`` child."""
+    (stamp_value,) = p
+    return stamp_value
+
+
+@__pg.production('stamp : from-domain by-domain optinfo DATETIME')
+def stamp(p):
+    """Collect the four parts of a time stamp line into one list.
+
+    Returns ``[from-domain, by-domain, optinfo, DATETIME]`` in grammar order.
+    """
+    from_part, by_part, opt_part, date_token = p
+    return [from_part, by_part, opt_part, date_token]
+
+
+@__pg.production('from-domain : FROMSEP extended-domain')
+def from_domain(p):
+    """Return ``[FROMSEP token, extended-domain value]``."""
+    keyword, source = p
+    return [keyword, source]
+
+
+@__pg.production('by-domain : BYSEP extended-domain')
+def by_domain(p):
+    """Return ``[BYSEP token, extended-domain value]``."""
+    keyword, receiver = p
+    return [keyword, receiver]
+
+
+@__pg.production('extended-domain : DOMAIN')
+def extended_domain(p):
+    """Bare host name: return the DOMAIN token itself (not wrapped in a list)."""
+    (domain_token,) = p
+    return domain_token
+
+
+@__pg.production('extended-domain : DOMAIN tcp-info')
+def extended_domain_tcp(p):
+    """Host name followed by tcp-info: return ``[DOMAIN token, tcp-info]``."""
+    domain_token, details = p
+    return [domain_token, details]
+
+
+@__pg.production('extended-domain : IPV4ADDRESS tcp-info')
+def extended_domain_addr(p):
+    """Address followed by tcp-info: return ``[IPV4ADDRESS token, tcp-info]``."""
+    address_token, details = p
+    return [address_token, details]
+
+
+@__pg.production('extended-domain : extended-domain IGNORABLESTR')
+def extended_domain_ignorable(p):
+    """Drop a trailing IGNORABLESTR and keep the inner extended-domain value."""
+    inner, _ignored = p
+    return inner
+
+
+@__pg.production('tcp-info : IPV4ADDRESS')
+def tcp_info(p):
+    """tcp-info holding only an address: return ``[IPV4ADDRESS token]``."""
+    return p
+
+
+@__pg.production('tcp-info : DOMAIN IPV4ADDRESS')
+def tcp_info_addr(p):
+    """tcp-info with name and address: return ``[DOMAIN, IPV4ADDRESS]`` tokens."""
+    return p
+
+
+@__pg.production('tcp-info : SMTPVERSION')
+def tcp_info_string(p):
+    """tcp-info holding a version string: return ``[SMTPVERSION token]``."""
+    return p
+
+
+@__pg.production('optinfo : via with id for')
+def optinfo(p):
+    """Bundle the optional clauses as ``[[via, with, id, for]]``.
+
+    The extra list level is part of the tree shape this action produces.
+    """
+    clauses = p
+    return [clauses]
+
+
+@__pg.production('via : ')
+def via_empty(p):
+    """Absent ``via`` clause: return the symbol list unchanged."""
+    return p
+
+
+@__pg.production('via : VIASEP link')
+def via(p):
+    """``via`` clause: return the ``link`` value wrapped in a list.
+
+    ``p`` is ``[VIASEP token, link value]``; the value sits at index 1
+    (index 2 does not exist for a two-symbol production).
+    """
+    return [p[1]]
+
+
+@__pg.production('with :')
+def with_empty(p):
+    """Absent ``with`` clause: return the symbol list unchanged."""
+    return p
+
+
+@__pg.production('with : WITHSEP protocol')
+def with_protocol(p):
+    """``with`` clause: return the ``protocol`` value, dropping the keyword."""
+    _keyword, protocol_value = p
+    return protocol_value
+
+
+@__pg.production('id : ')
+def id_empty(p):
+    """Absent ``id`` clause: return the symbol list unchanged."""
+    return p
+
+
+@__pg.production('id : IDSEP STRING')
+def id(p):
+    """``id`` clause with a plain string: return ``[STRING token]``.
+
+    NOTE: this module-level name shadows the builtin ``id``.
+    """
+    _keyword, ident = p
+    return [ident]
+
+
+# The EMAILADDR token matched here is actually a Message-Id in angle brackets.
+@__pg.production('id : IDSEP EMAILADDR')
+def id_addr(p):
+    """``id`` clause with an angle-bracketed value: return ``[EMAILADDR token]``."""
+    _keyword, ident = p
+    return [ident]
+
+
+@__pg.production('for : ')
+def for_empty(p):
+    """Absent ``for`` clause: return the symbol list unchanged."""
+    return p
+
+
+# For = "FOR" FWS 1*( Path / Mailbox ) CFWS
+@__pg.production('for : FORSEP EMAILADDR')
+def optinfo_for(p):
+    """``for`` clause: return ``[EMAILADDR token]``.
+
+    ``p`` is ``[FORSEP token, EMAILADDR token]``.  The recipient address is
+    the payload, so it is returned and the keyword dropped, matching what
+    the ``id`` and ``with`` actions do with their keywords.
+    """
+    return [p[1]]
+
+
+@__pg.production('link : TCPSEP STRING')
+def link(p):
+    """Link description: return ``[STRING token]``.
+
+    ``p`` is ``[TCPSEP token, STRING token]``; the string sits at index 1
+    (index 2 does not exist for a two-symbol production).
+    """
+    return [p[1]]
+
+
+@__pg.production('protocol : SMTPSEP')
+def protocol(p):
+    """Protocol name: return ``[SMTPSEP token]``."""
+    return p
+
+
+# Build the parser object from the productions registered on __pg above.
+parser = __pg.build()
+
+
+# from http://stackoverflow.com/questions/27450365/
+def sort_key(received_header):
+    """Return a sortable date tuple for a ``Received:`` header value.
+
+    The date of a trace line is whatever follows the last ``;``.  Passing
+    the whole header to ``parsedate_tz`` yields ``None``, so only that tail
+    is parsed.  A string without ``;`` is parsed as a whole, as before.
+
+    :param received_header: full header value or a bare RFC 2822 date
+    :returns: the 10-tuple from ``email.utils.parsedate_tz``, or ``None``
+        when no date can be parsed
+    """
+    date_part = received_header.rsplit(';', 1)[-1]
+    return email.utils.parsedate_tz(date_part)
+
+# [[Token('FROMSEP', 'from'),
+# [Token('DOMAIN', 'server.mymailhost.com'),
+# [[Token('DOMAIN', 'mail.mymailhost.com'),
+# Token('IPV4ADDRESS', '[126.43.75.123]')]]]],
+# [Token('BYSEP', 'by'),
+# [Token('DOMAIN', 'pilot01.cl.msu.edu'),
+# Token('SMTPVERSION', '8.10.2/8.10.2')]],
+# [[[], [Token('SMTPSEP', 'ESMTP')], [Token('STRING', 'NAA23597;')], []]],
+# Token('DATETIME', 'Fri, 12 Jul 2002 16:11:20 -0400 (EDT)')]
+#
+# [[Token('FROMSEP', 'from'),
+# [Token('DOMAIN', 'd1080.master.cz'),
+# [[Token('DOMAIN', 'p-lab.cz'),
+# Token('IPV4ADDRESS', '[89.185.245.149]')]]]],
+# [Token('BYSEP', 'by'),
+# [Token('DOMAIN', 'mx1.redhat.com'),
+# Token('SMTPVERSION', '8.14.4/8.14.4')]],
+# [[[],
+# [Token('SMTPSEP', 'ESMTP')],
+# [Token('STRING', 't07GaC1j031854')],
+# [Token('FORSEP', 'for')]]],
+# Token('DATETIME', 'Wed, 7 Jan 2015 11:36:13 -0500')]
+
+
+def _split_extended_domain(node):
+    """Split an extended-domain parse node into ``(host, details)``.
+
+    The grammar produces either a bare host token (no parenthesised part)
+    or a two-element list ``[host_token, detail_tokens]``; the bare form
+    is normalised to an empty detail list.
+    """
+    if isinstance(node, list):
+        return node[0], node[1]
+    return node, []
+
+
+def _first_text(tokens, token_type):
+    """Return the text of the first token of *token_type*, or ``None``."""
+    for token in tokens:
+        if token.gettokentype() == token_type:
+            return token.getstr()
+    return None
+
+
+def parse_header(in_str):
+    """Parse one ``Received:`` header value into a dictionary.
+
+    :param in_str: header value, e.g. ``from a (b [1.2.3.4]) by c ...; date``
+    :returns: ``{'from': {'halo', 'reveresed', 'ipaddr'}, 'by': {'server'},
+        'date': datetime}``.  ``reveresed`` and ``ipaddr`` are ``None`` when
+        the parenthesised part after the sending host has no host name or
+        no IPv4 address.  The misspelled keys are kept as they are because
+        callers may already rely on them.
+    :raises: whatever the rply lexer/parser raise for input that does not
+        match the grammar.
+    """
+    # Path: cbosgd!mhuxj!mhuxt!eagle!jerry
+    # Materialise the tokens once so they can be both logged and parsed,
+    # instead of rewinding the lexer stream through its internal index.
+    tokens = list(lexer.lex(in_str))
+    logging.debug('\nstream:\n%s', tokens)
+    parsed = parser.parse(iter(tokens))
+    logging.debug('\nparsed:\n%s', pformat(parsed))
+
+    from_list = parsed[0][1]
+    logging.debug('\nfrom_list = %s', from_list)
+    by_list = parsed[1][1]
+    logging.debug('by_list = %s', by_list)
+    with_list = parsed[2]
+    logging.debug('with_list = %s', with_list)
+    date_str = parsed[3].getstr()
+    logging.debug('date_str = %s', date_str)
+
+    from_host, from_details = _split_extended_domain(from_list)
+    by_host, _by_details = _split_extended_domain(by_list)
+
+    out = {
+        'from': {
+            'halo': from_host.getstr(),
+            # Select by token type: the details may be [DOMAIN, IPV4ADDRESS],
+            # [IPV4ADDRESS] or [SMTPVERSION], so positions are not reliable.
+            'reveresed': _first_text(from_details, 'DOMAIN'),
+            'ipaddr': _first_text(from_details, 'IPV4ADDRESS'),
+        },
+        'by': {'server': by_host.getstr()},
+        'date': dateutil.parser.parse(date_str),
+    }
+
+    return out
+
+# NOTE(review): the list is empty at this point, so this sort is a no-op;
+# presumably a placeholder for sorting real Received headers with sort_key.
+received_header_list = []
+received_header_list.sort(key=sort_key)
+
+if __name__ == '__main__':
+    # Demo: echo a sample Received header, then print each token the lexer
+    # produces for it, one per line.
+    sample_header = """from d1080.master.cz (p-lab.cz [89.185.245.149] (may be forged))
+ by mx1.redhat.com (8.14.4/8.14.4) with ESMTP id t07GaC1j031854
+ for <mcepl@redhat.com>; Wed, 7 Jan 2015 11:36:13 -0500"""
+    print(sample_header)
+    token_stream = lexer.lex(sample_header)
+    for tok in token_stream:
+        print(tok)