# Copyright (C) 2012-2013 by the Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.

"""Handler to hold posts with no subject and posts which appear to quote
digests.

THRESHHOLD = 4 works well for detecting digest quoting.  (The constant
really is spelled THRESHHOLD in this module.)

There is also code in this Handler to attempt to detect and hold posts which
quote excessively.  This is experimental and imperfect at this point.  If you
want to use this code, you will need to experiment to find an acceptable
value for RATIO, MINQUOTE and maybe MAXQUOTE.  Set RATIO = 0 to disable this
hold.

You can also set DIGESTABLE_ONLY = True to only apply this test to lists for
which digestable = Yes, and TEST_SUBJECT to restrict the test to messages
whose Subject contains a list's subject_prefix.

And, you can set REJECT_QUOTES = True to reject instead of hold messages with
excessive quoting.  If you do this, you need to add an appropriate
rejectquote.txt template to the appropriate places in your templates
directory.

Some statistics are collected in the message metadata in a decoration-data
supplemental dictionary for Decorate.py so that they may be reported via
msg_hdr or msg_footer.
"""

import re

from Mailman import i18n
from Mailman import Utils
from Mailman import Errors
from Mailman import MailList
from Mailman.Handlers.Hold import hold_for_approval
from Mailman.Logging.Syslog import syslog

# Minimum number of non-blank digest masthead lines found in message
# to cause message to be held.
THRESHHOLD = 4

# Ratio of quoted to non-blank unquoted text above which message is held.
RATIO = 0

# String containing all the characters any one of which is used to prefix
# a quoted line.
QUOTE_CHARS = '>:|'

# Minimum size of quoted material.  If quoted material not greater than this,
# hold is not applied.
MINQUOTE = 500

# If MAXQUOTE is greater than MINQUOTE, RATIO applies if the quoted material
# is greater than MAXQUOTE.  For MINQUOTE < quoted <= MAXQUOTE, the hold is
# applied if (unquoted * RATIO) is less than
# MAXQUOTE * (quoted - MINQUOTE) / (MAXQUOTE - MINQUOTE)
MAXQUOTE = 2000

# Flag to apply excessive quoting tests only to lists for which digestable is
# true.
DIGESTABLE_ONLY = True

# Flag to only apply excessive quoting tests to messages which contain the
# subject_prefix for this or possibly other lists.
# 0 -> don't test for subject_prefix
# 1 -> only test quoting if Subject contains this list's subject_prefix
# 2 -> only test quoting if Subject contains subject_prefix of any list in
#      the installation.
TEST_SUBJECT = 2

# Flag to reject instead of hold messages that appear to quote excessively.
REJECT_QUOTES = True

# Re to recognize no subject:
NSRE = re.compile(r'^\s*(re:|aw:|fwd:)?\s*(\(no subject\))?\s*$',
                  re.IGNORECASE)

# Re to recognize a digest subject.  Raw string so that \d reaches the regex
# engine instead of being treated as a (invalid) string escape.
DIGRE = re.compile(r' Digest, Vol \d+, Issue \d+$', re.IGNORECASE)

# Re to recognize a Yahoo reply quote
YAHOORE = re.compile(r'\n(_{32}|--- On [^\n]* wrote: *\n *)\n ?From:.*',
                     re.DOTALL)

# Re to recognize an AOL reply quote
AOLRE = re.compile(r'\nIn a message dated[^\n]*, *\n[^\n]* writes: *\n.*',
                   re.DOTALL)

# Re to recognize an AOL Webmail or Comcast or Outlook?
# reply quote
AOLWRE = re.compile(
    r'\n----- ?Original Message ?----- *\n'
    r'From: [^\n]*\n'
    r'(To: [^\n]*\n(Cc: [^\n]*\n)?Sent: [^\n]*\n'
    r'|Sent: [^\n]*\nTo: [^\n]*\n(Cc: [^\n]*\n)?)'
    r'Subject: [^\n]*\n *\n.*',
    re.DOTALL)

# X-Mailer prefix -> regex recognizing that mailer's top-posted reply quote.
# Order matters: the first matching prefix wins, so 'AOL Webmail' must be
# listed before the shorter 'AOL'.
_MAILER_QUOTE_RES = (
    ('Yahoo', YAHOORE),
    ('AOL Webmail', AOLWRE),
    ('AOL', AOLRE),
    ('Zimbra', AOLWRE),
    ('Microsoft Outlook', AOLWRE),
    )

# Running character counts for the message currently being examined.  They
# are module globals because do_domain() updates them on behalf of process().
quoted = 0
unquoted = 0


# Identity stand-in for the translation function: the class attributes below
# are wrapped in _() but stay untranslated when the classes are defined.
def _(s):
    return s


class MessageHasNoSubject(Errors.HoldMessage):
    """Hold reason for a post whose Subject is empty or '(no subject)'."""
    reason = _('Message has no Subject')
    rejection = _('Posts to this list must have a non-empty Subject.')


class MessageQuotesDigest(Errors.HoldMessage):
    """Hold reason for a post containing digest masthead boilerplate."""
    reason = _('Message quotes digest boilerplate')
    rejection = _('This message appears to quote a digest. '
                  'Please remove excessive or irrelevant quoting.')


class MessageQuotesExcessively(Errors.HoldMessage):
    """Hold reason for a post with far more quoted than original text."""
    reason = _('Message has excessive quoting')
    rejection = _(
        'This message appears to have much more quoted than original content. '
        'Please quote only as much as is required to establish the context '
        'of your reply.')


class MessageHasDigestSubject(Errors.HoldMessage):
    """Hold reason for a post whose Subject is a digest's Subject."""
    reason = _('Message has a Digest subject')
    rejection = _('This message has a Digest subject. '
                  'Please provide a meaningful subject.')


# From here on _ is the real translation function.
_ = i18n._


def _reply_quote_cre(mailer):
    """Return the reply-quote regex for an X-Mailer value, or None."""
    for prefix, cre in _MAILER_QUOTE_RES:
        if mailer.startswith(prefix):
            return cre
    return None


def _quotes_excessively(quoted_len, unquoted_len):
    """Return True if the counts exceed the MINQUOTE/MAXQUOTE/RATIO limits."""
    if quoted_len <= MINQUOTE:
        return False
    if quoted_len > MAXQUOTE:
        return quoted_len > RATIO * unquoted_len
    # MINQUOTE < quoted_len <= MAXQUOTE: sliding threshold, written without
    # division so integer arithmetic does not truncate.
    return (MAXQUOTE * (quoted_len - MINQUOTE) >
            unquoted_len * RATIO * (MAXQUOTE - MINQUOTE))


def _subject_names_any_list(subject, mlist):
    """Return True if subject contains the subject_prefix of any list."""
    for listname in Utils.list_names():
        if listname == mlist.internal_name():
            # Reuse the list we already have rather than instantiating it.
            candidate = mlist
        else:
            candidate = MailList.MailList(listname, lock=False)
        if subject.find(candidate.subject_prefix.strip()) >= 0:
            return True
    return False


def process(mlist, msg, msgdata):
    """Hold or reject posts lacking a subject or quoting too much.

    Checks, in order: empty/'(no subject)' Subject, digest Subject, digest
    masthead lines in the text parts (THRESHHOLD), and finally the ratio of
    quoted to unquoted text in the text/plain parts (RATIO, MINQUOTE,
    MAXQUOTE, subject to DIGESTABLE_ONLY and TEST_SUBJECT).

    Side effects: msgdata['decoration-data'] is always set with
    'quoted_count' and 'unquoted_count'; the module globals quoted and
    unquoted are overwritten when the quoting test runs.

    Messages with msgdata['approved'] set skip every check.  Failing posts
    are passed to hold_for_approval(), except that excessive quoting raises
    Errors.RejectMessage when REJECT_QUOTES is true.
    """
    global quoted, unquoted

    # Initialize our quoting statistics dictionary so it's all there for
    # Decorate.py
    body_size = 0
    for part in msg.walk():
        if part.is_multipart():
            continue
        # A part can have no payload at all; count that as empty.
        body_size += len(part.get_payload(decode=True) or '')
    msgdata['decoration-data'] = {
        'quoted_count': 0,
        'unquoted_count': body_size,
        }
    if msgdata.get('approved'):
        return

    # NOTE(review): the checks below appear to rely on hold_for_approval()
    # raising to stop processing -- confirm against Mailman.Handlers.Hold.
    subject = msg.get('subject', '')
    # Does this message have a Subject?
    if NSRE.search(subject):
        hold_for_approval(mlist, msg, msgdata, MessageHasNoSubject)
    # Does this message have a digest subject?
    if DIGRE.search(subject.strip()):
        hold_for_approval(mlist, msg, msgdata, MessageHasDigestSubject)

    # Is there digest boilerplate in this message?
    # Get the masthead, but without emails and URLs.
    mastheadtxt = Utils.maketext(
        'masthead.txt',
        {'real_name': mlist.real_name,
         'got_list_email': '',
         'got_listinfo_url': '',
         'got_request_email': '',
         'got_owner_email': '',
         },
        mlist=mlist)
    msgtext = ''.join([part.get_payload(decode=True) or ''
                       for part in msg.walk()
                       if part.get_content_maintype() == 'text'])
    matches = 0
    for line in mastheadtxt.splitlines():
        line = line.strip()
        if line and line in msgtext:
            matches += 1
    if matches >= THRESHHOLD:
        hold_for_approval(mlist, msg, msgdata, MessageQuotesDigest)

    # Try to get an unquoted/quoted ratio.  Look only at text/plain parts.
    # HTML can represent quoted text in too many complicated ways, and
    # many lists will have removed HTML by now anyway.
    subject_text = None
    if TEST_SUBJECT:
        # Get the Subject as a Unicode.  It may have unencoded non-ascii.
        subject_text = Utils.oneline(
            msg['subject'], 'us-ascii').decode('us-ascii', 'replace')
        if (TEST_SUBJECT == 1 and
                subject_text.find(mlist.subject_prefix.strip()) < 0):
            return
        # Defer TEST_SUBJECT == 2 until later.  It's expensive to instantiate
        # all the lists.
    if DIGESTABLE_ONLY and not mlist.digestable:
        return

    quoted = unquoted = 0
    # For recognized mailers, the first text/plain part is first checked for
    # that mailer's reply-quote block; None means nothing (more) to check.
    pending_cre = _reply_quote_cre(msg.get('x-mailer', ''))
    for part in msg.walk():
        if part.get_content_type() != 'text/plain':
            continue
        parttext = part.get_payload(decode=True) or ''
        count_lines = True
        if pending_cre is not None:
            # do_domain() returns False once it has accounted for the part.
            count_lines = do_domain(pending_cre, parttext)
            pending_cre = None
        if count_lines:
            for line in parttext.splitlines():
                line = line.strip()
                if line and line[:1] in QUOTE_CHARS:
                    quoted += len(line)
                else:
                    unquoted += len(line)

    # Update the decoration-data dictionary
    msgdata['decoration-data']['quoted_count'] = quoted
    msgdata['decoration-data']['unquoted_count'] = unquoted

    if RATIO <= 0 or not _quotes_excessively(quoted, unquoted):
        return
    if TEST_SUBJECT == 2 and not _subject_names_any_list(subject_text, mlist):
        return
    if REJECT_QUOTES:
        rmsg = Utils.maketext(
            'rejectquote.txt',
            dict={'listname': mlist.real_name,
                  'minquote': MINQUOTE,
                  'maxquote': MAXQUOTE,
                  'ratio': RATIO,
                  'quoted': quoted,
                  'unquoted': unquoted,
                  },
            mlist=mlist)
        syslog('vette', 'Rejected excessive quote:\n' + msg.as_string())
        # Call form is equivalent to "raise cls, arg" and parses everywhere.
        raise Errors.RejectMessage(rmsg)
    hold_for_approval(mlist, msg, msgdata, MessageQuotesExcessively)


def do_domain(cre, msgtext):
    """Account for a mailer-specific reply quote found in msgtext.

    If cre matches, the matched text is added to the global quoted count and
    the remainder of msgtext to the global unquoted count, and False is
    returned.  If cre does not match, nothing is counted and True is
    returned so the caller can fall back to line-by-line counting.
    """
    global quoted, unquoted
    mo = cre.search(msgtext)
    if mo is None:
        return True
    # Use this match's own length: the running quoted total may already
    # include text counted before this call.
    matched = len(mo.group(0))
    quoted += matched
    unquoted += len(msgtext) - matched
    return False