#! /usr/bin/python # Copyright (C) 2001-2015 by the Free Software Foundation, Inc. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, # USA. """Check a mbox archive for various anomalies and report them. Optionally, check that the message sequence numbers in the mbox match against a list's pipermail archive. Usage: check_arch [options] listname [mbox] Options: -n / --nomatch Just check Date: header and From_ lines. Don't check archive for corresponding html files. This is incompatible with the -s/--subject option. -s / --subject In addition to checking whether an file exists in the archive, check if the Subject: in the html file matches the message. -v / --verbose Report the entire 'bad' message from the mbox, not just some headers. -h / --help Print this message and exit listname is required. mbox is the mbox archive, defaults to archives/private/listname.mbox/listname.mbox This script checks the mbox file and looks for and reports messages with non-standard From_ separators (possible unescaped From_ lines in message bodies), messages with no headers following the From_ line and messages with missing or invalid Date: headers. It also checks and reports messages in the mbox for which no corresponding message with the same sequence number exists in the appropriate period in the HTML archive unless suppressed with the -n/--nomatch option and will optionally check if the HTML Subject: matches the mbox Subject. """ import os import re import cgi import sys import email import getopt import mailbox import paths from Mailman import Utils from Mailman import Errors from Mailman import Mailbox from Mailman import MailList from Mailman.i18n import _ from Mailman.Archiver.HyperArch import HyperArchive cre = re.compile(mailbox.UnixMailbox._fromlinepattern) REpat = re.compile( r"\s*RE\s*(\[\d+\]\s*)?:\s*", re.IGNORECASE) cset = 'us-ascii' def usage(code, msg=''): if code: fd = sys.stderr else: fd = sys.stdout print >> fd, _(__doc__) if msg: print >> fd, msg sys.exit(code) def fix_subj(s): """Various clean ups for subject matching.""" s = Utils.oneline(s, cset) # Trim Re: from the subject line (Copied from HyperArch.py) i = 0 while i != -1: result = REpat.match(s) if result: i = result.end(0) s = s[i:] if s == '': s = _('No subject') else: i = -1 # Convert some HTML entities s = cgi.escape(s, quote=True) # and finally return re.escape(s) def report(msg, verbose): if verbose: print msg.as_string() else: mid = msg['message-id'] or 'missing' subj = msg['subject'] or 'missing' date = msg['date'] or 'missing' fm = msg['from'] or 'missing' print _("""Message-ID: %(mid)s Subject: %(subj)s Date: %(date)s From: %(fm)s """) def check_msg(arch, msg, msgno, match, subject, verbose): uf = msg.get_unixfrom() if not cre.match(uf): print _('Suspicious Unix-From line in message %(msgno)d\n%(uf)s') if len(msg) == 0: print _('No headers in message %(msgno)d') report(msg, verbose) return if not msg['date']: print _('No Date: header in message %(msgno)d') report(msg, verbose) return try: md = msg['date'] msgdt = email.Utils.mktime_tz(email.Utils.parsedate_tz(md)) except (OverflowError, ValueError, TypeError): print _('Unparseable or invalid Date: %(md)s') report(msg, verbose) return if msgdt < 0: print _('Unparseable or invalid Date: %(md)s') report(msg, verbose) return if match: fname = '%06d.html' % msgno fpath = os.path.join(arch.maillist.archive_dir(), arch.dateToVolName(msgdt), fname) if not os.path.isfile(fpath): print _('Non-existent file: %(fpath)s') return if subject: xre = re.compile('

%s

' % fix_subj(msg['subject']), re.MULTILINE) if not xre.search(open(fpath).read()): print _( 'Message Subject: not found in html for message %(msgno)d') report(msg, verbose) def main(): global cset try: opts, args = getopt.getopt( sys.argv[1:], 'hnsv', ['help', 'nomatch', 'subject', 'verbose']) except getopt.error, msg: usage(1, msg) match = True subject = verbose = False for opt, arg in opts: if opt in ('-h', '--help'): usage(0) if opt in ('-n', '--nomatch'): match = False if opt in ('-s', '--subject'): subject = True if opt in ('-v', '--verbose'): verbose = True if subject and not match: usage(1,_("'-n/--nomatch' and '-s/--subject' are mutually exclusive.")) try: listname = args[0].lower() except IndexError: usage(1,_('Required listname missing.')) try: mlist = MailList.MailList(listname, lock=0) except Errors.MMUnknownListError, e: usage(2,_('No such list: %(listname)s\n%(e)s')) cset = Utils.GetCharSet(mlist.preferred_language) if len(args) > 1: mbox = args[1] else: mbox = mlist.ArchiveFileName() try: fp = open(mbox) except IOError, msg: usage(3, _('Cannot open mbox file %(mbox)s: %(msg)s')) arch = HyperArchive(mlist) msgno = 0 mb = Mailbox.Mailbox(fp) for msg in mb: check_msg(arch, msg, msgno, match, subject, verbose) msgno += 1 if __name__ == '__main__': main()