#!/usr/bin/env python
#
# Copyright (C) 2006 by the Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.
#
# 2006-04-04 mas  Initial Rev. Adapted in part from Jim Tittsler's
#                 mailman-subscribers.py script.
#                 Mark Sapiro
# 2006-04-18 mas  Try to get missing info from other places.
# 2006-08-21 mas  Try to deal with MIME messages.
# 2006-08-26 mas  Accept more than HTML as the second alternative for
#                 multipart/alternative.
# 2006-12-10 mas  Made unmunging of sender address conditional on
#                 having an address.

"""Retrieve list archives from lists.topica.com.

Usage: %(PROGRAM)s [options] listname user-email password

Options:

    --output file
    -o file
        Write output to specified file (required).

    --start nn
    -s nn
        Start with message nn rather than 0 (useful for testing, error
        recovery and incremental updating).

    --verbose
    -v
        Print starting message number of each retrieved index page to
        monitor progress.

    --debug
    -d
        Print copious debugging output - not recommended.

    --help
    -h
        Print this help message and exit

    listname is the name of the mailing list.

    user-email is the email address of a list subscriber with a Topica
    account.

    password is the list subscriber's Topica account password.

If cookielib (Python 2.4 and later) or http.cookiejar (Python 3) is
available, we use it.  Otherwise we require ClientCookie
http://wwwsearch.sourceforge.net/ClientCookie/
"""

import re
import sys
import time
import email
import getopt
import urllib

# The script was written for Python 2; the fallbacks below let the same
# source also run on Python 3 without changing any public name.
try:
    import urllib2
    from urllib import urlencode
    from HTMLParser import HTMLParser
except ImportError:
    import urllib.request as urllib2
    from urllib.parse import urlencode
    from html.parser import HTMLParser

try:
    from email.generator import Generator
    from email.utils import mktime_tz, parseaddr, parsedate_tz
except ImportError:
    # Python < 2.5 only has the capitalised module names.
    from email.Generator import Generator
    from email.Utils import mktime_tz, parseaddr, parsedate_tz

_PY2 = sys.version_info[0] == 2

NL = '\n'

# Pattern to remove line break and continuation white space from HTML
# multi-line subject.
# NOTE(review): the copy of this file under review had lost the text between
# the opening quote and "\s"; "<br>" is reconstructed from the comment above
# and the re.I flag.  Confirm against the upstream script.
BR = re.compile(r'<br>\s', re.I)

# Number of body lines to scan looking for MIME Content-Type: header
MIME_LINES = 10

# Markers Topica puts around the original message in "forward" mode.
_FWD_START = '------ Start of Forwarded Message ------'
_FWD_END = '------ End of Forwarded Message ------'

# Links on an index page: one per message, plus paging links.
_MID_RE = re.compile(r'mid=(?P<mid>\d+)', re.I)
_START_RE = re.compile(r'start=(?P<newstart>\d+)', re.I)

# Patterns used by get_info() to scrape a message.html page.
# NOTE(review): the group names are restored from the m.group() calls, but
# the HTML markup that originally anchored these three patterns was missing
# from the copy under review.  As written they are far too permissive
# (_SUBJ_RE and _SNDR_RE match any line).  Restore the surrounding tags from
# the upstream script before relying on get_info().
_SUBJ_RE = re.compile(r'(?P<subj>.*)', re.I)
_DATE_RE = re.compile(r' (?P<date>.*) ', re.I)
_SNDR_RE = re.compile(r'(?P<sndr>.*)', re.I)

# Attribution line on the "reply" page; it carries the unmunged address.
_REPLY_SNDR_RE = re.compile(r'^(?P<sndr>[^ @]+@[^ @]+) wrote:$', re.I)

_CTYPE_RE = re.compile(r'content-type:', re.I)

# Use the standard library cookie jar when there is one.
try:
    import cookielib
except ImportError:
    try:
        import http.cookiejar as cookielib
    except ImportError:
        cookielib = None

if cookielib is not None:
    policy = cookielib.DefaultCookiePolicy(rfc2965=True)
    cookiejar = cookielib.CookieJar(policy)
    opener = urllib2.build_opener(
        urllib2.HTTPCookieProcessor(cookiejar)).open
else:
    import ClientCookie
    # if this is a new ClientCookie, we need to turn on RFC2965 cookies
    cookiejar = ClientCookie.CookieJar()
    try:
        cookiejar.set_policy(ClientCookie.DefaultCookiePolicy(rfc2965=True))
        # install an opener that uses this policy
        opener = ClientCookie.build_opener(
            ClientCookie.HTTPCookieProcessor(cookiejar))
        ClientCookie.install_opener(opener)
    except AttributeError:
        # must be an old ClientCookie, which already accepts RFC2965 cookies
        pass
    opener = ClientCookie.urlopen

PROGRAM = sys.argv[0]


def _println(fd, *items):
    """Write items to fd separated by spaces, followed by a newline."""
    fd.write(' '.join([str(item) for item in items]) + NL)


def usage(code, msg=''):
    """Print the module help text and optional msg, then exit with code.

    Output goes to stderr when code is non-zero, to stdout otherwise.
    """
    if code:
        fd = sys.stderr
    else:
        fd = sys.stdout
    # __doc__ is None under "python -OO"; don't crash while reporting.
    _println(fd, (__doc__ or '') % globals())
    if msg:
        _println(fd, msg)
    sys.exit(code)


# State shared between main(), TopicaHTMLParser and get_info().
mids = []        # message ids found on the current index page
more = True      # whether another index page should be fetched
start = '0'      # "start" parameter of the current index page
index_url = None
msg_url = None
mid = None       # id of the message currently being processed


def _fetch(url, params=None):
    """Return the body of url as text, fetched with the cookie-aware opener.

    params, when given, is a dict that is form-encoded and sent as request
    data.
    """
    data = None
    if params is not None:
        data = urlencode(params)
        if not _PY2:
            # Python 3 wants request data as bytes; urlencode() output is
            # percent-encoded ASCII.
            data = data.encode('ascii')
    page = opener(url, data)
    try:
        body = page.read()
    finally:
        page.close()
    if not isinstance(body, str):
        # latin-1 maps every byte to exactly one code point, so this cannot
        # fail whatever charset the page really uses.
        body = body.decode('latin-1')
    return body


def _open_output(path):
    """Open the output file for writing text."""
    if _PY2:
        return open(path, 'wt')
    # Counterpart of the latin-1 decode in _fetch().
    return open(path, 'wt', encoding='latin-1')


class TopicaHTMLParser(HTMLParser):
    """Collect message ids and the next page offset from an index page.

    Results are stored in the module globals mids, more and start.
    """

    def handle_starttag(self, tag, attrs):
        global mids, more, start
        if tag != 'a':
            return
        for a, v in attrs:
            # A bare <a href> yields a value of None.
            if a != 'href' or not v:
                continue
            if v.find('mid=') >= 0:
                m = _MID_RE.search(v)
                if m:
                    mids.append(m.group('mid'))
            if v.find('read?sort=d&start=') >= 0:
                m = _START_RE.search(v)
                if m:
                    newstart = m.group('newstart')
                    # Only the first link past the current offset counts.
                    if int(newstart) > int(start) and not more:
                        start = newstart
                        more = True


def _strip_forward_wrapper(lines):
    """Return the lines between the first start and last end marker.

    A missing marker leaves that end of the list untouched.
    """
    try:
        lines = lines[lines.index(_FWD_START) + 1:]
    except ValueError:
        pass
    for i in range(len(lines) - 1, -1, -1):
        if lines[i] == _FWD_END:
            return lines[:i]
    return lines


def _set_header(msg, name, value):
    """Replace header name in msg; drop it when there is no value."""
    del msg[name]
    if value:
        msg[name] = value


def _build_message(lines, listname):
    """Turn the lines of one forwarded message into a Message object.

    Missing To:, Date:, Subject: and From: are filled in where possible and
    a unix From_ line is set so the result can be written to an mbox.
    """
    # See if there are MIME parts and try to deal with them
    check_mime(lines)
    # Make an email.Message.Message object for easier header manipulation
    msg = email.message_from_string(NL.join(lines) + NL)
    date = msg.get('sent')
    subj = msg.get('subject')
    sndr = msg.get('from')
    if not msg.get('to'):
        del msg['to']
        msg['To'] = '%s@topica.com' % listname
    if not date or not subj or not sndr:
        date, subj, sndr = get_info(date, subj, sndr)
        _set_header(msg, 'Subject', subj)
        _set_header(msg, 'From', sndr)
    # Topica's Sent: header becomes Date:
    del msg['sent']
    if date:
        msg['Date'] = date
    name, addr = parseaddr(sndr or '')
    if addr == '':
        addr = '-'
    stamp = ''
    raw_date = msg.get('date')
    if raw_date and raw_date.strip():
        parsed = parsedate_tz(raw_date)
        if parsed:
            try:
                stamp = time.asctime(time.gmtime(mktime_tz(parsed)))
            except (ValueError, OverflowError):
                # Unrepresentable date; leave the From_ line undated.
                stamp = ''
    msg.set_unixfrom('From %s %s' % (addr, stamp))
    # Change Content-Type: for mp/a
    fix_mpa(msg)
    return msg


def main():
    """Log in to Topica and write the list archive to the output file."""
    global mids, more, start, index_url, msg_url, mid
    try:
        opts, args = getopt.getopt(
            sys.argv[1:], "ho:s:dv",
            ["help", "output=", "start=", "debug", "verbose"])
    except getopt.GetoptError:
        usage(2)
    outfile = None
    verbose = False
    debug = False
    for o, a in opts:
        if o in ("-v", "--verbose"):
            verbose = True
        elif o in ("-d", "--debug"):
            debug = True
        elif o in ("-h", "--help"):
            usage(0)
        elif o in ("-o", "--output"):
            outfile = a
        elif o in ("-s", "--start"):
            start = a
    if not outfile:
        usage(2, 'Output file required.')
    if len(args) != 3:
        usage(2)
    if not start.isdigit():
        usage(2, 'Start must be a non-negative integer.')
    listname, user, password = args

    # Open only after the arguments are validated so a usage error does not
    # truncate an existing file.
    fp = _open_output(outfile)
    try:
        # get the login page and its cookie
        _fetch('http://lists.topica.com/login.html')
        login_url = 'http://lists.topica.com/perl/login.pl'
        p = {'location': '',
             'al': '',
             'email': user,
             'password': password,
             }
        # login
        text = _fetch(login_url, p)
        if 'Invalid username and/or password.' in text:
            usage(1, 'Invalid username and/or password.')
        if debug:
            _println(sys.stdout, login_url, urlencode(p))
            _println(sys.stdout, text)
        # Create a generator instance to write the messages.
        # This will escape any '^From ' lines in the body.
        gen = Generator(fp)
        # logged in, now the main loop
        while more:
            index_url = 'http://lists.topica.com/lists/%s/read' % listname
            p = {'sort': 'd', 'start': start}
            try:
                text = _fetch(index_url, p)
            except urllib2.HTTPError:
                usage(1, 'Topica server error. Possibly a bad listname.\n'
                         'If not, retry may succeed')
            if 'Sorry, we experienced an error.' in text:
                usage(1, 'Topica error. Possibly a bad listname.\n'
                         'If not, retry may succeed')
            if 'You do not have access to this list.' in text:
                usage(1,
                      "Topica says you don't have access to the '%s' list."
                      % listname)
            if debug:
                _println(sys.stdout, index_url, urlencode(p))
                _println(sys.stdout, text)
            if verbose:
                _println(sys.stdout, start)
            # The parser sets these again if the page has messages and a
            # link to a later page.
            more = False
            mids = []
            parser = TopicaHTMLParser()
            parser.feed(text)
            parser.close()
            for mid in mids:
                msg_url = ('http://lists.topica.com/lists/%s/read/post.html'
                           % listname)
                p = {'mode': 'forward', 'mid': mid}
                text = _fetch(msg_url, p)
                if debug:
                    _println(sys.stdout, msg_url, urlencode(p))
                    _println(sys.stdout, text)
                # strip all but the original message
                lines = _strip_forward_wrapper(text.splitlines())
                msg = _build_message(lines, listname)
                gen.flatten(msg, unixfrom=True)
                gen.write(NL)
    finally:
        fp.close()


def get_info(date, subj, sndr):
    """Fill in whichever of date, subj and sndr is missing.

    The values are scraped from the message.html page of the message whose
    id is in the global mid.  Returns a (date, subj, sndr) tuple; items that
    could not be found are returned as passed in.
    """
    global index_url, msg_url, mid
    lines = _fetch(index_url + '/message.html', {'mid': mid}).splitlines()
    for line in lines:
        if not subj:
            m = _SUBJ_RE.search(line)
            if m:
                subj = re.sub(BR, '', m.group('subj'))
        if not date:
            m = _DATE_RE.search(line)
            if m:
                date = m.group('date')
        if not sndr:
            m = _SNDR_RE.search(line)
            if m:
                sndr = m.group('sndr')
                # sndr is now either a real name or an address, never
                # both :-(  If it's an address, it's munged so try to get
                # it unmunged.
                if sndr.find('@') >= 0:
                    reply = _fetch(msg_url, {'mid': mid, 'mode': 'reply'})
                    for line2 in reply.splitlines():
                        m = _REPLY_SNDR_RE.search(line2)
                        if m:
                            sndr = m.group('sndr')
                            break
        if subj and sndr and date:
            break
    return (date, subj, sndr)


def check_mime(lines):
    """Add multipart headers to lines when the body looks like MIME parts.

    If one of the first MIME_LINES body lines is a Content-Type: header
    preceded by a boundary line, MIME-Version: and Content-Type: headers
    are inserted in place at the end of the header block.  Otherwise lines
    is left unchanged.
    """
    # Find the body
    try:
        blank = lines.index('')
    except ValueError:
        # No header/body separator (or no lines at all): nothing to scan.
        return
    # Now look for Content-Type:
    ctype_at = None
    last = min(blank + 1 + MIME_LINES, len(lines))
    for k in range(blank + 1, last):
        if _CTYPE_RE.match(lines[k]):
            ctype_at = k
            break
    if ctype_at is None:
        return
    # Now find the boundary, searching back towards the blank line.
    boundary = None
    for k in range(ctype_at - 1, blank, -1):
        if lines[k].startswith('--'):
            boundary = lines[k][2:]
            break
    if boundary is None:
        # No boundary - punt.
        return
    # insert MIME headers, Call it mp/mixed for now, fix mp/a later
    lines.insert(blank,
                 'Content-Type: multipart/mixed;\n boundary="%s"' % boundary)
    lines.insert(blank, 'MIME-Version: 1.0')


def fix_mpa(msg):
    """Relabel msg as multipart/alternative when it has that shape.

    That is, when it consists of exactly a text/plain part followed by a
    second fancier text part.
    """
    if not msg.is_multipart():
        return
    parts = msg.get_payload()
    if len(parts) != 2:
        return
    plain, fancy = parts
    if (plain.get_content_type() == 'text/plain' and
            fancy.get_content_maintype() == 'text' and
            fancy.get_content_subtype() in ('html', 'enriched',
                                            'richtext', 'rtf')):
        msg.set_type('multipart/alternative')


if __name__ == '__main__':
    main()