#! /usr/bin/python
# Copyright (C) 2001-2016 by the Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.

"""Clean up an .mbox archive file.

The archiver looks for Unix-From lines separating messages in an mbox archive
file.  For compatibility, it specifically looks for lines that start with
"From " -- i.e. the letters capital-F, lowercase-r, o, m, space, ignoring
everything else on the line.

Normally, any lines that start "From " in the body of a message should be
escaped such that a > character is actually the first on a line.  It is
possible though that body lines are not actually escaped.  This script
attempts to fix these by doing a stricter test of the Unix-From lines.  Any
lines that start "From " but do not pass this stricter test are escaped with a
> character.

This is an enhanced version of the standard cleanarch script.  It works just
like the normal cleanarch but it also looks at lines that look like Date:
headers and if they don't have a parseable date within the setting for
ARCHIVER_ALLOWABLE_SANE_DATE_SKEW, replaces the value with the date from
the Unix-From.

Usage: cleanarch2 [options] < inputfile > outputfile
Options:
    -s n
    --status=n
        Print a # character every n lines processed

    -q / --quiet
        Don't print changed line information to standard error.

    -n / --dry-run
        Don't actually output anything.

    -h / --help
        Print this message and exit
"""

import re
import sys
import time
import getopt
import mailbox
from email.Utils import parsedate, formatdate

# NOTE(review): `paths` is imported before the Mailman package, presumably
# because it sets up sys.path for it -- keep this ordering.
import paths
from Mailman import mm_cfg
from Mailman.i18n import _

# Strict Unix-From matcher, borrowed from the stdlib mailbox module.
cre = re.compile(mailbox.UnixMailbox._fromlinepattern)
# From RFC 2822, a header field name must contain only characters from 33-126
# inclusive, excluding colon.  I.e. from oct 41 to oct 176 less oct 072.
fre = re.compile(r'^[\041-\071\073-\176]+:')



def usage(code, msg=''):
    """Print the module docstring (and optional `msg`), then exit.

    A non-zero `code` sends the text to stderr, zero sends it to stdout.
    `code` is also the process exit status.
    """
    if code:
        fd = sys.stderr
    else:
        fd = sys.stdout
    print >> fd, _(__doc__)
    if msg:
        print >> fd, msg
    sys.exit(code)



def escape_line(line, lineno, quiet, output):
    """Emit `line` with a leading '>' so it no longer looks like a Unix-From.

    line   -- the offending "From " line, as read (normally newline ended)
    lineno -- 1-based input line number of `line`, used only for reporting
    quiet  -- when true, do not report the change on stderr
    output -- when false (dry run), do not write anything to stdout
    """
    if output:
        sys.stdout.write('>' + line)
    if not quiet:
        # _() fills in %(lineno)d from this function's local variables, so
        # the parameter name must stay `lineno`.
        print >> sys.stderr, _('Unix-From line changed: %(lineno)d')
        # Only strip the line terminator; the last line of the input may
        # not have one and must not lose a real character.
        print >> sys.stderr, line.rstrip('\n')



def check_date(line, lastfrom, lineno, quiet, output):
    """Emit a Date: header, replacing it when it is out of the sane range.

    The date in `line` is compared with the date of the Unix-From line that
    started the current message (`lastfrom`).  If the two differ by more than
    mm_cfg.ARCHIVER_ALLOWABLE_SANE_DATE_SKEW seconds, or the Date: value can
    not be parsed at all, a new Date: header built from the Unix-From date is
    written instead.

    line     -- the Date: header line
    lastfrom -- the most recent valid Unix-From line
    lineno   -- 1-based input line number of `line`, used only for reporting
    quiet    -- when true, do not report the change on stderr
    output   -- when false (dry run), do not write anything to stdout
    """
    lfdate = parsedate(re.sub(r'^From \s*\S+\s+', '', lastfrom, flags=re.I))
    try:
        lfsecs = time.mktime(lfdate)
    except (TypeError, OverflowError, ValueError):
        # The Unix-From line passed the pattern test but carries no usable
        # date (parsedate() returned None, or the values are out of range).
        # Without a reference there is nothing to compare against or to
        # substitute, so pass the header through untouched.
        if output:
            sys.stdout.write(line)
        return
    ddate = parsedate(re.sub(r'^Date:\s*', '', line, flags=re.I))
    # An unparseable or unrepresentable Date: counts as the epoch, which
    # normally puts it far outside the allowed skew.
    dsecs = 0
    if ddate:
        try:
            dsecs = time.mktime(ddate)
        except (OverflowError, ValueError):
            pass
    if abs(dsecs - lfsecs) <= mm_cfg.ARCHIVER_ALLOWABLE_SANE_DATE_SKEW:
        if output:
            sys.stdout.write(line)
        return
    # Date: is off.  Replace it with unix from date
    newline = 'Date: ' + formatdate(lfsecs) + '\n'
    if output:
        sys.stdout.write(newline)
    if not quiet:
        # _() interpolates from the locals `lineno`, `line` and `newline`.
        # The trailing comma suppresses print's own newline because
        # `newline` already ends with one.
        print >> sys.stderr, _(
            'Date: changed %(lineno)d\n%(line)s%(newline)s'),



def main():
    """Filter an mbox from stdin to stdout, fixing From_ and Date: lines."""
    try:
        opts, args = getopt.getopt(
            sys.argv[1:], 'hqns:',
            ['help', 'quiet', 'dry-run', 'status='])
    except getopt.error as msg:
        usage(1, msg)

    quiet = False
    output = True
    status = -1

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-q', '--quiet'):
            quiet = True
        elif opt in ('-n', '--dry-run'):
            output = False
        elif opt in ('-s', '--status'):
            try:
                status = int(arg)
            except ValueError:
                # _() interpolates %(arg)s from the local `arg`.
                usage(1, _('Bad status number: %(arg)s'))

    if args:
        usage(1)

    lineno = 0
    statuscnt = 0
    messages = 0
    inheader = False
    lastfrom = None
    prevline = None
    # One line of look-ahead.  A line that was read only to validate the
    # preceding Unix-From is parked here and then processed like any other
    # line on the next iteration, so it gets its own From_/Date: checks, its
    # own line number and its own turn as `prevline`.
    pending = None
    while True:
        if pending is None:
            line = sys.stdin.readline()
        else:
            line, pending = pending, None
        if not line:
            break
        lineno += 1
        if line.startswith('From '):
            is_unixfrom = False
            if cre.match(line):
                # This looks like a real Unix-From line.  But it could be a
                # message /about/ Unix-From lines, so as a second order test,
                # make sure there's an RFC 2822 header following.  At end of
                # input the look-ahead is '' and the test fails, so a
                # trailing From_ line is escaped as well.
                pending = sys.stdin.readline()
                is_unixfrom = fre.match(pending) is not None
            if is_unixfrom:
                messages += 1
                lastfrom = line
                inheader = True
                if output:
                    # Before we spit out the From_ line, make sure the
                    # previous line was blank.
                    if prevline is not None and prevline != '\n':
                        sys.stdout.write('\n')
                    sys.stdout.write(line)
            else:
                # This is a bogus Unix-From line
                escape_line(line, lineno, quiet, output)
        else:
            # Any old line
            if inheader and line.lower().startswith('date:'):
                check_date(line, lastfrom, lineno, quiet, output)
            elif output:
                sys.stdout.write(line)
            # A blank line ends the header section of the current message.
            if not line.strip():
                inheader = False
        if status > 0 and (lineno % status) == 0:
            sys.stderr.write('#')
            statuscnt += 1
            if statuscnt > 50:
                print >> sys.stderr
                statuscnt = 0
        prevline = line
    # _() interpolates %(messages)d from the local `messages`.
    print >> sys.stderr, _('%(messages)d messages found')



if __name__ == '__main__':
    main()