# Copyright (C) 2012-2013 by the Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
"""Handler to hold posts with no subject and posts which appear to
quote digests or which carry a digest-style Subject.

THRESHHOLD = 4 works well for detecting digest quoting.

There is also code in this Handler to attempt to detect and hold posts
which quote excessively. This is experimental and imperfect at this point.

If you want to use this code, you will need to experiment to find
acceptable values for RATIO, MINQUOTE and maybe MAXQUOTE. Set RATIO = 0
to disable this hold.

You can also set DIGESTABLE_ONLY = True to only apply this test to lists for
which digestable = Yes.

And, you can set REJECT_QUOTES = True to reject instead of hold messages with
excessive quoting. If you do this, you need to add an appropriate
rejectquote.txt template to the appropriate places in your templates directory.

Some statistics are collected in the message metadata in a decoration-data
supplemental dictionary for Decorate.py so that they may be reported via
msg_hdr or msg_footer.
"""
import re

from Mailman import i18n
from Mailman import Utils
from Mailman import Errors
from Mailman import MailList

from Mailman.Handlers.Hold import hold_for_approval
from Mailman.Logging.Syslog import syslog

# Minimum number of non-blank digest masthead lines that must be found in the
# message text for the message to be held as quoting a digest.
# (The name is spelled with a double 'H'; it is kept as is because it is the
# name the code below reads.)
THRESHHOLD = 4

# Ratio of quoted to non-blank unquoted text above which the message is held
# (or rejected, see REJECT_QUOTES). The excessive quoting test only runs when
# this is greater than 0, so RATIO = 0 disables it.
RATIO = 0

# String containing all the characters any one of which, as the first
# non-whitespace character of a line, marks that line as quoted.
QUOTE_CHARS = '>:|'

# Minimum size (in characters) of quoted material. If the quoted material is
# not greater than this, the excessive quoting test is not applied.
MINQUOTE = 500

# If MAXQUOTE is greater than MINQUOTE, RATIO applies directly if the quoted
# material is greater than MAXQUOTE. For MINQUOTE < quoted <= MAXQUOTE, the
# test triggers if (unquoted * RATIO) is less than
# MAXQUOTE * (quoted - MINQUOTE) / (MAXQUOTE - MINQUOTE)
MAXQUOTE = 2000

# Flag to apply the excessive quoting test only to lists for which digestable
# is true. When set, messages to non-digestable lists skip the test entirely.
DIGESTABLE_ONLY = True

# Flag to only apply the excessive quoting test to messages whose Subject
# contains the subject_prefix of this or possibly other lists.
# 0 -> don't test for subject_prefix
# 1 -> only test quoting if Subject contains this list's subject_prefix
# 2 -> only test quoting if Subject contains subject_prefix of any list in
#      the installation.
TEST_SUBJECT = 2

# Flag to reject instead of hold messages that appear to quote excessively.
# Rejection builds its notice from the 'rejectquote.txt' template, which must
# therefore exist when this is True.
REJECT_QUOTES = True

# Re to recognize no subject: an empty Subject, or one consisting only of an
# optional reply/forward marker and an optional literal "(no subject)".
NSRE = re.compile(r'^\s*(re:|aw:|fwd:)?\s*(\(no subject\))?\s*$',
                  re.IGNORECASE)

# Re to recognize a digest subject, i.e. one ending in
# " Digest, Vol N, Issue M". Raw string so that \d is a regexp escape and not
# an (invalid) string escape.
DIGRE = re.compile(r' Digest, Vol \d+, Issue \d+$', re.IGNORECASE)

# Re to recognize a Yahoo reply quote. Matches from the separator through the
# end of the text (DOTALL).
YAHOORE = re.compile(r'\n(_{32}|--- On [^\n]* wrote: *\n *)\n ?From:.*',
                     re.DOTALL)

# Re to recognize an AOL reply quote. Matches from the "In a message dated"
# attribution through the end of the text (DOTALL).
AOLRE = re.compile(r'\nIn a message dated[^\n]*, *\n[^\n]* writes: *\n.*',
                   re.DOTALL)

# Re to recognize an AOL Webmail or Comcast or Outlook? reply quote.
# The To:/Sent: headers are accepted in either order, each with an optional
# Cc: following To:. Matches through the end of the text (DOTALL).
AOLWRE = re.compile(
    r'\n----- ?Original Message ?----- *\n'
    r'From: [^\n]*\n'
    r'(To: [^\n]*\n(Cc: [^\n]*\n)?Sent: [^\n]*\n'
    r'|Sent: [^\n]*\nTo: [^\n]*\n(Cc: [^\n]*\n)?)'
    r'Subject: [^\n]*\n'
    r' *\n.*',
    re.DOTALL)

def _(s):
    """Return `s` unchanged.

    Temporary identity stand-in for the translation function, in effect
    while the hold-reason classes below are defined, so their strings are
    stored untranslated. The name is rebound to i18n._ after those classes.
    Presumably the strings are wrapped in _() at all so that message-catalog
    extraction still picks them up -- confirm against the i18n tooling.
    """
    return s

class MessageHasNoSubject(Errors.HoldMessage):
    """Hold reason used when the Subject is missing or effectively empty.

    Passed to hold_for_approval() by process() when the Subject matches NSRE.
    The strings are stored untranslated (the module-level _ is an identity
    function at this point); how `reason` and `rejection` are presented is
    decided by Errors.HoldMessage / the hold machinery, not shown here.
    """
    reason = _('Message has no Subject')
    rejection = _('Posts to this list must have a non-empty Subject.')

class MessageQuotesDigest(Errors.HoldMessage):
    """Hold reason used when the message text contains digest boilerplate.

    Passed to hold_for_approval() by process() when at least THRESHHOLD
    non-blank lines of the list's digest masthead are found in the message
    text. The strings are stored untranslated (the module-level _ is an
    identity function at this point).
    """
    reason = _('Message quotes digest boilerplate')
    rejection = _("""This message appears to quote a digest.
Please remove excessive or irrelevant quoting.""")

class MessageQuotesExcessively(Errors.HoldMessage):
    """Hold reason used when quoted text greatly exceeds original text.

    Passed to hold_for_approval() by process() when the excessive quoting
    test triggers and REJECT_QUOTES is false. The strings are stored
    untranslated (the module-level _ is an identity function at this point).
    """
    reason = _('Message has excessive quoting')
    rejection = _("""This message appears to have much more quoted than
original content.  Please quote only as much as is
required to establish the context of your reply.""")

class MessageHasDigestSubject(Errors.HoldMessage):
    """Hold reason used when the Subject looks like a digest's Subject.

    Passed to hold_for_approval() by process() when the stripped Subject
    matches DIGRE (ends in " Digest, Vol N, Issue M"). The strings are stored
    untranslated (the module-level _ is an identity function at this point).
    """
    reason = _('Message has a Digest subject')
    rejection = _("""This message has a Digest subject.
Please provide a meaningful subject.""")

# From here on _ is the real translation function; the identity version
# defined above was only in effect while the hold-reason classes were built.
_ = i18n._

def _reply_quote_re(mailer):
    """Return the reply-quote regexp to use for an X-Mailer value, or None.

    The first prefix that `mailer` starts with wins, so the order of the
    table matters: 'AOL Webmail' has to be tried before the shorter 'AOL'.
    """
    for prefix, cre in (('Yahoo', YAHOORE),
                        ('AOL Webmail', AOLWRE),
                        ('AOL', AOLRE),
                        ('Zimbra', AOLWRE),
                        ('Microsoft Outlook', AOLWRE)):
        if mailer.startswith(prefix):
            return cre
    return None


def process(mlist, msg, msgdata):
    """Hold or reject posts with a bad Subject or with excessive quoting.

    Checks, in order:
      1. Subject missing/empty (NSRE)          -> hold, MessageHasNoSubject
      2. Subject looks like a digest (DIGRE)   -> hold, MessageHasDigestSubject
      3. At least THRESHHOLD non-blank masthead lines appear in the text parts
                                               -> hold, MessageQuotesDigest
      4. Quoted text in the text/plain parts is excessive relative to unquoted
         text (RATIO, MINQUOTE, MAXQUOTE), subject to DIGESTABLE_ONLY and
         TEST_SUBJECT -> reject (REJECT_QUOTES) or hold,
         MessageQuotesExcessively.

    Messages with msgdata['approved'] set skip all checks.

    Side effects: msgdata['decoration-data'] is (re)created with
    'quoted_count' and 'unquoted_count'; the module globals `quoted` and
    `unquoted` are used as accumulators shared with do_domain().

    Raises Errors.RejectMessage when check 4 triggers and REJECT_QUOTES is
    true.
    NOTE(review): the checks are written as if hold_for_approval() does not
    return normally (i.e. it raises the hold exception) -- confirm against
    Mailman.Handlers.Hold.
    """
    # Accumulators shared with do_domain(); see below.
    global quoted, unquoted

    # Initialize our quoting statistics dictionary so it's all there for
    # Decorate.py even if we return early.
    body_size = 0
    for part in msg.walk():
        if part.is_multipart():
            continue
        # A part without a payload yields None; count it as empty.
        body_size += len(part.get_payload(decode=True) or '')
    msgdata['decoration-data'] = {'quoted_count': 0,
                                  'unquoted_count': body_size,
                                  }
    if msgdata.get('approved'):
        return

    subject = msg.get('subject', '')
    # Does this message have a Subject?
    if NSRE.search(subject):
        hold_for_approval(mlist, msg, msgdata, MessageHasNoSubject)

    # Does this message have a digest subject?
    if DIGRE.search(subject.strip()):
        hold_for_approval(mlist, msg, msgdata, MessageHasDigestSubject)

    # Is there digest boilerplate in this message?
    # Get the masthead, but without emails and URLs.
    mastheadtxt = Utils.maketext(
        'masthead.txt',
        {'real_name' :        mlist.real_name,
         'got_list_email':    '',
         'got_listinfo_url':  '',
         'got_request_email': '',
         'got_owner_email':   '',
         }, mlist=mlist)
    text_parts = []
    for part in msg.walk():
        if part.get_content_maintype() == 'text':
            text_parts.append(part.get_payload(decode=True) or '')
    msgtext = ''.join(text_parts)
    matches = 0
    for line in mastheadtxt.splitlines():
        line = line.strip()
        if line and line in msgtext:
            matches += 1
    if matches >= THRESHHOLD:
        hold_for_approval(mlist, msg, msgdata, MessageQuotesDigest)

    # Try to get an unquoted/quoted ratio. Look only at text/plain parts.
    # HTML can represent quoted text in too many complicated ways, and
    # many lists will have removed HTML by now anyway.
    if TEST_SUBJECT:
        # Get the Subject as a Unicode. It may have unencoded non-ascii.
        s = Utils.oneline(msg.get('subject', ''),
                          'us-ascii').decode('us-ascii', 'replace')
    if TEST_SUBJECT == 1 and s.find(mlist.subject_prefix.strip()) < 0:
        return
    # Defer TEST_SUBJECT == 2 until later. It's expensive to instantiate
    # all the lists.
    if DIGESTABLE_ONLY and not mlist.digestable:
        return

    quoted = unquoted = 0
    # Some mailers introduce the quoted original with a header block instead
    # of prefixing each line. For those, try the mailer specific regexp on
    # the first text/plain part only.
    reply_re = _reply_quote_re(msg.get('x-mailer', ''))
    for part in msg.walk():
        if part.get_content_type() != 'text/plain':
            continue
        msgtext = part.get_payload(decode=True) or ''
        if reply_re is not None:
            cre, reply_re = reply_re, None
            if not do_domain(cre, msgtext):
                # do_domain() found the reply block and already accounted
                # for this part in quoted/unquoted.
                continue
        # Count by line prefix. Blank lines contribute nothing either way.
        for line in msgtext.splitlines():
            line = line.strip()
            if line and line[:1] in QUOTE_CHARS:
                quoted += len(line)
            else:
                unquoted += len(line)

    # Update the decoration-data dictionary
    msgdata['decoration-data']['quoted_count'] = quoted
    msgdata['decoration-data']['unquoted_count'] = unquoted

    # RATIO <= 0 disables the test; small quotes are always acceptable.
    if RATIO <= 0 or quoted <= MINQUOTE:
        return
    if quoted > MAXQUOTE:
        excessive = quoted > RATIO * unquoted
    else:
        # Between MINQUOTE and MAXQUOTE the allowance is scaled linearly.
        # Written without division so MAXQUOTE == MINQUOTE can't divide by
        # zero.
        excessive = (MAXQUOTE * (quoted - MINQUOTE) >
                     unquoted * RATIO * (MAXQUOTE - MINQUOTE))
    if not excessive:
        return

    if TEST_SUBJECT == 2:
        # Only act if the Subject contains the subject_prefix of some list
        # in this installation.
        found = False
        for ln in Utils.list_names():
            if ln == mlist.internal_name():
                other = mlist
            else:
                other = MailList.MailList(ln, lock=False)
            if s.find(other.subject_prefix.strip()) >= 0:
                found = True
                break
        if not found:
            return

    if REJECT_QUOTES:
        rmsg = Utils.maketext('rejectquote.txt',
                              dict={'listname': mlist.real_name,
                                    'minquote': MINQUOTE,
                                    'maxquote': MAXQUOTE,
                                    'ratio': RATIO,
                                    'quoted': quoted,
                                    'unquoted': unquoted,
                                   },
                              mlist=mlist
                             )
        syslog('vette',
               'Rejected excessive quote:\n' + msg.as_string())
        # Call form is equivalent to "raise cls, arg" and also valid Python 3.
        raise Errors.RejectMessage(rmsg)
    else:
        hold_for_approval(mlist, msg, msgdata, MessageQuotesExcessively)

def do_domain(cre, msgtext):
    """Account for a mailer specific reply quote in `msgtext`.

    Searches `msgtext` with the compiled regexp `cre`. If it matches, the
    length of the match is added to the module global `quoted`, the length of
    the rest of `msgtext` is added to the module global `unquoted`, and False
    is returned (the caller should not count this part again). If it does not
    match, the globals are left alone and True is returned so the caller
    falls back to line-by-line counting.

    The globals `quoted` and `unquoted` must already be defined (process()
    sets them to 0 before calling this).
    """
    global quoted, unquoted
    mo = cre.search(msgtext)
    if not mo:
        return True
    quoted_len = len(mo.group(0))
    quoted += quoted_len
    # Use this match's own length here, not the running total in `quoted`,
    # so the result is right even if `quoted` was non-zero on entry.
    unquoted += len(msgtext) - quoted_len
    return False