#!/usr/bin/env python
#
# Copyright (C) 2006 by the Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#
# 2006-04-04 mas    Initial Rev. Adapted in part from Jim Tittsler's
#                   mailman-subscribers.py script.
#                   Mark Sapiro <msapiro@value.net>
# 2006-04-18 mas    Try to get missing info from other places.
# 2006-08-21 mas    Try to deal with MIME messages.
# 2006-08-26 mas    Accept more than HTML as the second alternative for
#                   multipart/alternative.
# 2006-12-10 mas    Made unmunging of sender address conditional on
#                   having an address.

"""Retrieve list archives from lists.topica.com.

Usage: %(PROGRAM)s [options] listname user-email password

Options:
   --output file
   -o file
       Write output to specified file (required).

   --start nn
   -s nn
       Start with message nn rather than 0 (useful for testing,
       error recovery and incremental updating).

   --verbose
   -v
       Print starting message number of each retrieved index page
       to monitor progress.

   --debug
   -d
       Print copious debugging output - not recommended.

   --help
   -h
       Print this help message and exit

   listname is the name of the mailing list.
   user-email is the email address of a list subscriber with a Topica account.
   password is the list subscriber's Topica account password.

   If Python 2.4's cookielib is available, we use it.  Otherwise we require
   ClientCookie  http://wwwsearch.sourceforge.net/ClientCookie/
"""

import re
import sys
import time
import email
import getopt
import urllib
import urllib2
from HTMLParser import HTMLParser
from email.Generator import Generator

# Line separator used when re-joining message lines and between messages.
NL = '\n'
# Pattern matching an HTML line break plus the single whitespace character
# that follows it; used to rejoin an HTML multi-line subject.
# Raw string so that \s reaches the regex engine as written.
BR = re.compile(r'<BR>\s', re.I)
# Number of body lines to scan looking for MIME Content-Type: header
MIME_LINES = 10

# Build 'opener', the callable used for every HTTP request in this script.
# It is called as opener(url) or opener(url, urlencoded_post_data) and must
# share one cookie jar across calls.
# if we have Python 2.4's cookielib, use it
try:
    import cookielib
    policy = cookielib.DefaultCookiePolicy(rfc2965 = True)
    cookiejar = cookielib.CookieJar(policy)
    # Keep only the bound open() method; the rest of the script just calls it.
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar)).open
except ImportError:
    # No cookielib: fall back to the third-party ClientCookie package.
    import ClientCookie
    # if this is a new ClientCookie, we need to turn on RFC2965 cookies
    cookiejar = ClientCookie.CookieJar()
    try:
        cookiejar.set_policy(ClientCookie.DefaultCookiePolicy(rfc2965 = True))
        # install an opener that uses this policy
        opener = ClientCookie.build_opener(
                ClientCookie.HTTPCookieProcessor(cookiejar))
        ClientCookie.install_opener(opener)
    except AttributeError:
        # must be an old ClientCookie, which already accepts RFC2965 cookies
        pass
    # In either case requests go through ClientCookie.urlopen; presumably it
    # picks up the opener installed above when there is one - TODO confirm.
    opener = ClientCookie.urlopen

# Name the script was invoked as; usage() substitutes it into the module
# docstring via %(PROGRAM)s.
PROGRAM = sys.argv[0]

# Define True/False on interpreters that do not provide the names.
# NOTE(review): the cookie setup earlier in this file already passes
# rfc2965 = True before this fallback runs - confirm whether the shim is
# still needed or should move above that code.
try:
    True, False
except NameError:
    True = 1
    False = 0

def usage(code, msg=''):
    """Print the module help text and an optional message, then exit.

    code is the process exit status.  A non-zero code sends the output to
    stderr; zero (a plain help request) sends it to stdout.
    msg, when non-empty, is printed after the help text.
    Always terminates the program through sys.exit(code).
    """
    # Default to stdout and switch to stderr only for error exits.
    stream = sys.stdout
    if code:
        stream = sys.stderr
    # The module docstring contains %(PROGRAM)s, filled in from globals().
    help_text = __doc__ % globals()
    print >> stream, help_text
    if msg:
        print >> stream, msg
    sys.exit(code)

# State shared between main() and TopicaHTMLParser:
#   mids  - message ids collected from the index page being parsed
#   more  - True when another index page should be fetched
#   start - message offset (as a string) of the index page to request
mids = []
more = True
start = '0'

class TopicaHTMLParser(HTMLParser):
    """Collect message ids and the next-page offset from an index page.

    Results are reported through the module globals mids, more and start
    rather than through instance attributes.
    """

    def handle_starttag(self, tag, attrs):
        """Inspect the href of every <a> tag.

        Links containing 'mid=<digits>' contribute a message id to mids.
        Links containing 'read?sort=d&start=<digits>' are paging links; the
        first one seen whose offset is greater than the current start
        becomes the new start and sets more to True.
        """
        global mids, more, start
        if tag != 'a':
            return
        for a, v in attrs:
            # HTMLParser reports a valueless attribute (<a href>) with a
            # value of None, which has no find() method - skip it.
            if a != 'href' or v is None:
                continue
            if v.find('mid=') >= 0:
                m = re.search(r'mid=(?P<mid>\d+)', v, re.I)
                if m:
                    mids.append(m.group('mid'))
            if v.find('read?sort=d&start=') >= 0:
                m = re.search(r'start=(?P<newstart>\d+)', v, re.I)
                if m:
                    newstart = m.group('newstart')
                    # 'not more' ensures only the first forward link on
                    # the page is taken.
                    if int(newstart) > int(start) and not more:
                        start = newstart
                        more = True

def main():
    global mids, more, start, index_url, msg_url, mid
    try:
        opts, args = getopt.getopt(sys.argv[1:], "ho:s:dv",
                ["help", "output=", "start=", "debug", "verbose"])
    except:
        usage(2)

    fp = None
    verbose = False
    debug = False
    for o,a in opts:
        if o in ("-v", "--verbose"):
            verbose = True
        if o in ("-d", "debug"):
            debug = True
        if o in ("-h", "--help"):
            usage(0)
        if o in ("-o", "--output"):
            fp = open(a, "wt")
        if o in ("-s", "--start"):
            start = a
    if not fp:
        usage(2, 'Output file required.')
    if len(args) != 3:
        usage(2)

    login_page = 'http://lists.topica.com/login.html'
    # get the login page and it's cookie
    page = opener(login_page)
    page.close()

    login_url = 'http://lists.topica.com/perl/login.pl'
    p = {'location': '',
         'al': '',
         'email': args[1],
         'password': args[2]
         }
    # login
    page = opener(login_url, urllib.urlencode(p))
    lines = page.read()
    page.close()
    if lines.find('Invalid username and/or password.') >= 0:
        usage(1, 'Invalid username and/or password.')
    if debug:
        print login_url, urllib.urlencode(p)
        print lines

    # Create a generator instance to write the messages.
    # This will escape any '^From ' lines in the body.
    gen = Generator(fp)

    # logged in, now the main loop
    while more:
        index_url = 'http://lists.topica.com/lists/%s/read' % args[0]
        p = {'sort': 'd',
             'start': start
             }
        try:
            page = opener(index_url, urllib.urlencode(p))
            lines = page.read()
            page.close()
        except urllib2.HTTPError:
            usage(1, """Topica server error. Possibly a bad listname.
If not, retry may succeed""")
        if lines.find('Sorry, we experienced an error.') >= 0:
            usage(1, """Topica error. Possibly a bad listname.
If not, retry may succeed""")
        if lines.find('You do not have access to this list.') >= 0:
            usage(1, "Topica says you don't have access to the '%s' list."
                      % args[0])
        if debug:
            print index_url, urllib.urlencode(p)
            print lines

        if verbose:
            print start
        more = False
        mids = []
        parser = TopicaHTMLParser()
        parser.feed(lines)
        parser.close()

        for mid in mids:
            msg_url = 'http://lists.topica.com/lists/%s/read/post.html' \
                      % args[0]
            p = {'mode': 'forward',
                 'mid': mid
                 }
            page = opener(msg_url, urllib.urlencode(p))
            lines = page.read()
            page.close()
            if debug:
                print msg_url, urllib.urlencode(p)
                print lines
            lines = lines.splitlines()
            # strip all but the original message
            for i in range(len(lines)):
                if lines[i] == '------ Start of Forwarded Message ------':
                    lines = lines[i+1:]
                    break
            r = range(len(lines))
            r.reverse()
            for i in r:
                if lines[i] == '------ End of Forwarded Message ------':
                    lines = lines[:i]
                    break
            # See if there are MIME parts and try to deal with them
            check_mime(lines)
            # Make an email.Message.Message object for easier header
            # manipulation
            msg = email.message_from_string(NL.join(lines) + NL)
            date = msg.get('sent')
            subj = msg.get('subject')
            sndr = msg.get('from')
            recip = msg.get('to')
            if not recip:
                del msg['to']
                msg['To'] = '%s@topica.com' % args[0]
            if not date or not subj or not sndr:
                date, subj, sndr = get_info(date, subj, sndr)
                del msg['subject']
                msg['Subject'] = subj
                del msg['from']
                msg['From'] = sndr
            del msg['sent']
            msg['Date'] = date
            name, addr = email.Utils.parseaddr(sndr)
            date = email.Utils.parsedate_tz(msg.get('date'))
            if addr == '':
                addr = '-'
            if date:
                date = time.asctime(time.gmtime(email.Utils.mktime_tz(date)))
            else:
                date = ''
            msg.set_unixfrom('From %s  %s' % (addr, date))
            # Change Content-Type: for mp/a
            fix_mpa(msg)
            gen.flatten(msg, unixfrom=True)
            gen.write(NL)

    fp.close()

def _unmunged_sender(munged):
    """Return the sender address taken from the message's reply page.

    Fetches msg_url in 'reply' mode for the current mid and returns the
    address from the first '<address> wrote:' line.  Returns munged
    unchanged when no such line is found.
    """
    query = {'mid': mid,
             'mode': 'reply'}
    page = opener(msg_url, urllib.urlencode(query))
    reply_lines = page.read().splitlines()
    page.close()
    for reply_line in reply_lines:
        hit = re.search(r'^(?P<sndr>[^ @]+@[^ @]+) wrote:$',
            reply_line, re.I)
        if hit:
            return hit.group('sndr')
    return munged

def get_info(date, subj, sndr):
    """Fill in a missing date, subject or sender from the message web page.

    Each argument that is already true is returned unchanged.  For the
    others, the page index_url + '/message.html' for the current mid is
    scanned line by line for the matching HTML fragment.  Reads the module
    globals index_url, msg_url and mid, which main() sets.

    Returns the tuple (date, subj, sndr); an item that could not be found
    keeps the value that was passed in.
    """
    global index_url, msg_url, mid
    query = {'mid': mid}
    page = opener(index_url + '/message.html', urllib.urlencode(query))
    html_lines = page.read().splitlines()
    page.close()
    for html_line in html_lines:
        if not subj:
            hit = re.search(
                r'<FONT CLASS="headline".*COLOR="#990099"><B>(?P<subj>.*)</B></FONT>',
                html_line, re.I)
            if hit:
                # Rejoin a subject that the page split across lines.
                subj = BR.sub('', hit.group('subj'))
        if not date:
            hit = re.search(
                r'<FONT.*SIZE="-2">&nbsp;<NOBR>(?P<date>.*)&nbsp;</NOBR></FONT>',
                html_line, re.I)
            if hit:
                date = hit.group('date')
        if not sndr:
            hit = re.search(
                r'<FONT.*<A.*onClick=.*?mode=replytosender&mid=.*true">(?P<sndr>.*)</A></FONT>',
                html_line, re.I)
            if hit:
                sndr = hit.group('sndr')
                # sndr is now either a real name or an address, never both :-(
                # if it's an address, it's munged so try to get it unmunged
                if sndr.find('@') >= 0:
                    sndr = _unmunged_sender(sndr)
        # Stop scanning as soon as all three values are known.
        if date and subj and sndr:
            break

    return (date, subj, sndr)

def check_mime(lines):
    """Add outer MIME headers, in place, when the body looks multipart.

    lines is the message as a list of lines: headers, a blank line, then
    the body.  If one of the first MIME_LINES body lines is a Content-Type:
    header and a line starting with '--' precedes it in the body, that
    line is taken as the boundary.  'MIME-Version' and a
    'Content-Type: multipart/mixed' header naming the boundary are then
    inserted just before the blank separator line.

    Returns None.  lines is left untouched when there is no separator
    (including an empty list), no Content-Type: header or no boundary.
    """
    # Find the body
    for i in range(len(lines)):
        if lines[i] == '':
            break
    else:
        # No blank separator line, so no body.  This also covers an empty
        # list, where i would otherwise never be bound.
        return
    # Now look for Content-Type:
    found = False
    for j in range(MIME_LINES):
        if i + j + 1 >= len(lines):
            return
        if re.search('(?i)^content-type:', lines[i+j+1]):
            found = True
            break
    if not found:
        return
    # Now find the boundary: walk back from the Content-Type: line towards
    # the separator, which itself is never examined.
    while j > 0 and not lines[i+j].startswith('--'):
        j -= 1
    if j == 0:
        # No boundary - punt.
        return
    boundary = lines[i+j][2:]
    # insert MIME headers, Call it mp/mixed for now, fix mp/a later
    # Both go in at index i, so MIME-Version ends up first.
    lines.insert(i, 'Content-Type: multipart/mixed;\n boundary="%s"' % boundary)
    lines.insert(i, 'MIME-Version: 1.0')

def fix_mpa(msg):
    """Relabel a two-part plain-plus-rich-text message as alternative.

    msg is an email message object.  When it is multipart with exactly two
    parts, the first being text/plain and the second being text/html,
    text/enriched, text/richtext or text/rtf, its content type is changed
    in place to multipart/alternative.  Any other message is left as is.
    """
    if not msg.is_multipart():
        return
    parts = msg.get_payload()
    if len(parts) != 2:
        return
    plain_part, fancy_part = parts
    if plain_part.get_content_type() != 'text/plain':
        return
    if fancy_part.get_content_maintype() != 'text':
        return
    fancy_subtypes = ('html', 'enriched', 'richtext', 'rtf')
    if fancy_part.get_content_subtype() in fancy_subtypes:
        msg.set_type('multipart/alternative')

# Run the retrieval only when executed as a script, not when imported.
if __name__ == '__main__':
    main()