# coding=utf-8

# Encyclopaedia Britannica 2008 Ultimate DVD to Mediawiki converter
# (C) 2008 Ulrich Hecht
# released under the terms of the GNU Public License v2

# Directory that contains the DVD's 'data' tree (presumably the mount point
# of the DVD); every database path in this script is built from it, e.g.
# ebdir + '/data/ARTICLE_DATA/articledb/'.
ebdir = '/mnt'

import struct
import gzip
import os
import re
import urllib
import htmlentitydefs
import time
import tempfile
import pickle
import sys

# Module-level tables shared by the conversion functions.
articletitle = {}	# article id -> wiki title (used to build [[links]] to articles)
articlel = {}		# NOTE(review): presumably article id -> stored length (cf. lhash in getarticle) - confirm
at_bysize = []		# (page title, text to look for) pairs used for auto-linking; presumably ordered by size - confirm
indextitle = {}		# index entry id -> wiki title
indexl = {}		# NOTE(review): presumably the length table matching indextitle - confirm
yeartitle = {}		# yearbook / year in review id -> wiki title
yearl = {}		# NOTE(review): presumably the length table matching yeartitle - confirm
imgtype = {}		# image id -> file type used as the extension in [[Image:...]] links

alltitles = set()	# every title handed out so far; consulted to keep titles unique

# Titles of articles that we don't want to link to as they rarely
# fit the context of the articles linked from (common words and bare
# first/last names).
# NOTE(review): the code that consults this list is not in this part of
# the file; it is kept as a plain list because its consumers are not visible.
wiki_blacklist = [
'recognition','acceptance','range','Nation, The','spirit',
'dominant','period','sense','function','phase','complex',
'transmission','force','couple','basis','settle','chief',
'Richard','William','private','James','Young','talent',
'suggestion','power','sequence','theme','group','action',
'second','general','order','level','interval','stock','event',
'measure','activity','cycle','analysis','development',
'rolling','conte','count','reason','interest','negative',
'stress','matter','scale','number','check','limit','medium',
'staff','minor','opera','cross','hundred','ultra','spiritual',
'ground','flour','minimum','entail','representation','Easter',
'intention','custom','thing','spell','policy','villa','zhong',
'light','match','swift','lighter','spring','Ferdinand',
'position','capture','house','parallel','thermal','degree',
'domain','agent','distribution','break','figure','lateral',
'Albert','Elizabeth','suite','round','Henry','quart','Davis',
'David','George','Edward','Felix','Harrison','Edwin', 'takin',
'Forbes','Howard','comma','matte','Robert','Roland','convent',
'Samuel','Grant','Lawrence','Mitchell','Edmund','Muhammad',
'Elijah','Alfred','Joseph','Rashi','raisin','Minim','Adrian',
'Nicholas','Hassan','Morgan','Florian','Diana','Helen',
'Edgar','Roswell','Desmond','Gabriel','catch','Jackson',
'Jacob','Norman','Philip','Michael','Peter','Adolf','Louis',
'twist','transit','Charles','Eberhard','Sebastian','Pierre',
'Hamilton','brief','Brigit','launch','capital','front','colon',
'Cameron','Blair','Craven','Academy','Armstrong','Prince',
'Queen','Bryan','Frank','Russell','Lindsey','Douglas','western',
'Benjamin','Pushkin','Barre','John','state','region','family',
'square','Kennet',
]

def unpack(string):
  """Return the decompressed contents of the gzip data in string.

  The data is spooled to a named temporary file so that gzip.open() can
  read it. Both the temporary file and the gzip reader are closed before
  returning (also when decompression fails), so no file handles or
  temporary files are left behind until garbage collection.
  """
  t = tempfile.NamedTemporaryFile()
  try:
    t.write(string)
    t.flush()	# make sure the data is on disk before it is reopened by name
    g = gzip.open(t.name)
    try:
      return g.read()
    finally:
      g.close()
  finally:
    t.close()	# closing the temporary file also deletes it

def cleanandlink(article, mytitle, postproc = True):
  import HTMLParser
  class BritParser(HTMLParser.HTMLParser):
    squelch = ['script', 'title', 'style']	# tags the entire contents of which should always be scrapped
    ignore = ['html', 'head', 'meta']	# tags that don't have to be handled
    passthrough = ['table', 'tr', 'sup', 'sub', 'u', 'center', 'small', 'strong', 'em', 'cite']	# tags to be passed through verbatim
    dontreport = ['a', 'span', 'scripts', 'base', 'link', 'stylesheets', 'body']	# tags that we do not want to hear about if they cannot be handled

    def __init__(self):
      self.skip = 0	# ignore most data while > 0
      self.level = {}	# nesting level for each tag
      self.emptytag = False	# set to true if there is text inside the tag
      self.lastdata = ''	# temp storage for text while building a link
      self.state = 'dump'	# either 'dump' (write data) or 'link' (save data until link is done)
      self.out = ''	# final output
      self.linkref = None	# href attribute for a tags
      self.scrapul = False	# inside a caption list that has to be scrapped
      self.scrapclear = False	# skip "clear" div (used for formatting)
      self.paragraphed = True	# remember if we already wrote a newline to prevent adding too many
      self.inlist = False		# inside a list
      self.listtype = 'ul'
      self.spanoff = None		# what to write to end the current span
      self.italicsdepth = 0	# nested italics depth
      self.bolddepth = 0		# nested bold text depth
      self.lastquote = 0	# either 2 (italics) or 3 (bold)
      self.infont = 0
      self.intd = False
      HTMLParser.HTMLParser.__init__(self)
    
    # is attribute attr in attrs? (optional: and does it have value value?)
    def hasattr(self, attrs, attr, value = None):
      #print 'attrs',attrs,'attr',attr,'value',value
      for i in attrs:
        if i[0] == attr:
          if i[1] == value or value == None: return True
          else: return False
      return False
    
    # returns value for attribute attr
    def getattr(self, attrs, attr):
      for i in attrs:
        if i[0] == attr: return i[1]
      return None
    
    def doparagraph(self):
      if not self.paragraphed:
        if not self.out.endswith('\n'): self.out += '\n'
        self.out += '\n'
        self.paragraphed = True
      
    def handle_starttag(self, tag, attrs):
      # count tag nesting depth
      if not self.level.has_key(tag): self.level[tag] = 1
      else: self.level[tag] += 1
      
      # nothing inside the tag so far (it just started)
      self.emptytag = True
      
      if tag in self.squelch:
        self.skip += 1
        return
      
      if tag in self.ignore:
        return
      
      # 'clear' divs are used for formatting and don't contain anything useful
      if tag == 'div' and self.hasattr(attrs, 'class', 'clear'):
        self.skip += 1
        self.scrapclear = True
        return
      
      # this is where most articles start. if this div is present, discard
      # everything that came before it
      if tag == 'div' and self.hasattr(attrs, 'id', 'articleBody'):
        self.out = ''
        return

      # this is where usable content ends
      # FIXME: scraps the citation info
      if tag == 'a' and self.hasattr(attrs, 'onclick', 'scrollToTop()'):
        self.skip += 1
        return

      # image captions; redundant as they contain the same text as the
      # img alt tag, which we use instead -> scrap
      if tag == 'ul' and self.hasattr(attrs, 'class', 'captions'):
        self.skip += 1
        self.scrapul = True
        return

      # all the tags handled after this do not influence self.skip, so
      # we don't have to handle them when in skip mode
      if self.skip > 0: return
      
      # lists, mostly used in index entries
      if tag == 'ul' or tag == 'ol':
        self.inlist = True
        self.listtype = tag
        if not self.paragraphed:
          self.out += '\n\n'
          self.paragraphed = True
        return
      
      # list item
      if tag == 'li':
        # mediawiki gets confused by empty lines inside lists -> scrap them
        # FIXME: isn't this redundant? the rstrip() below does this as well
        while self.out.endswith('\n\n'):
          self.out = self.out[:-1]
        if self.level[self.listtype] != 1: print 'listlevel',self.level[self.listtype]
        if self.level[self.listtype] < 1: self.level[self.listtype] = 1
        if self.listtype == 'ul': listchar = '*'
        else: listchar = '#'
        self.out = self.out.rstrip() + '\n' + listchar * self.level[self.listtype] + ' '
        return
      
      # links to other articles
      if tag == 'a' and self.hasattr(attrs, 'href'):
        self.linkref = self.getattr(attrs, 'href')
        # ignore funky links
        if (not self.linkref.startswith('ebcid')) or 'assemblyId' in self.linkref: self.linkref = None
        else:
          # link mode: do not add data, save it until the closing tag
          self.state = 'link'
          self.lastdata = ''
          return
      
      # paragraphs
      if tag == 'p' or tag == 'div':
        if not self.paragraphed and not self.inlist:
          if not self.out.endswith('\n'): self.out += '\n'
          self.out += '\n'
          self.paragraphed = True
        return
      if tag == 'br':
        self.doparagraph()
        return
      
      if tag == 'hr':
        if not self.out.endswith('\n'): self.out += '\n'
        self.out += '----\n'
        return
      
      # images
      if tag == 'img':
        self.linkref = self.getattr(attrs, 'src')
        if 'ebcid' in self.linkref: print self.linkref
        id = self.getid('binaryId')	 # image number
        alt = self.getattr(attrs, 'alt') # image caption
        try: type = imgtype[id]		 # image file type (usually gif or jpg)
        except KeyError: type = 'img'
        if 'InlineBinaryIdentifier' in self.linkref:
          self.out += '[[Image:' + str(id) + '.' + type + ']]\n\n'
        else:
          self.out += '[[Image:' + str(id) + '.' + type + '|thumb'
          if alt: self.out += '|' + alt + ']]\n\n'
          else: self.out += ']]\n\n'
        self.paragraphed = True
        self.linkref = None
        return
      
      # italics
      if tag == 'i' or (tag == 'span' and self.hasattr(attrs, 'style') and 'italic' in self.getattr(attrs, 'style')):
        self.italicsdepth += 1
        if self.italicsdepth == 1:
          self.spanoff = "''"
          #self.out += "''"
          if self.out.endswith("''") and self.lastquote == 2:
            self.out = self.out[:-2]
            #self.out += 'x'
          else:
            self.handle_data("''")
            self.lastquote = 2
        return
      
      # bold
      # FIXME: is there any bold text using spans?
      if tag == 'b':
        self.bolddepth += 1
        if self.bolddepth == 1:
          self.spanoff = "'''"
          #self.out += "'''"
          if self.out.endswith("'''") and self.lastquote == 3:
            self.out = self.out[:-3]
            #self.out += 'y'
          else:
            self.handle_data("'''")
            self.lastquote = 3
        return
      
      # any other span we don't handle, so no need to do anything on closing tag
      if tag == 'span': self.spanoff = None
      
      # headings
      if tag[0] == 'h':
        try:
          depth = int(tag[1])
          self.doparagraph()
          if not self.out.endswith('\n'):
            # even if a (logical) paragraph has already been made, we need to make sure
            # this starts on a new line
            self.out += '\n'
          self.out += '=' * depth
          return
        except ValueError: pass	# non-heading tag -> continue
      
      # font size
      if tag == 'font' and self.hasattr(attrs, 'size'):
        self.out += '<font size="' + self.getattr(attrs, 'size') + '">'
        self.infont += 1
        return
      
      if tag == 'font' and self.hasattr(attrs, 'face'):
        self.out += '<font face="' + self.getattr(attrs, 'face') + '">'
        self.infont += 1
        return
      
      if tag == 'font' and self.hasattr(attrs, 'color'):
        self.out += '<font color="' + self.getattr(attrs, 'color') + '">'
        self.infont += 1
        return
      
      # blockquotes need an extra newline at the end, so they can't be handled by self.passthrough
      if tag == 'blockquote':
        self.out += '<blockquote>'
        return
      
      # any tags that can be passed straight through
      if tag in self.passthrough:
        self.out += '<' + tag + '>'
        return
      
      if tag == 'td':
        self.intd = True
        self.out += '<td>'
        return
      
      # print warning about unhandled tags
      if not self.skip:
        if not tag in self.dontreport:
          print 'STRT', tag, attrs
    
    # returns the specified id in an ebcid link
    def getid(self, idname):
      import re
      #print 'idname',idname,'linkref',self.linkref
      try: id = int(re.search(idname + '=([^&"]*?)[&"]', self.linkref).groups()[0]) # 'xx=123&' or 'xx=123"'
      except:
        try: id = int(re.search(idname + '=([^&"]*?)$', self.linkref).groups()[0])  # xx=123 (end of line)
        except: id = 0	# FIXME: is it better to throw an exception here?
      return id
      
    def handle_endtag(self, tag):
      # count tag nesting depth
      if not self.level.has_key(tag): self.level[tag] = 0
      else:
        if self.level[tag] > 0: self.level[tag] -= 1

      if tag in self.ignore:
        return
      
      if tag in self.squelch:
        self.skip -= 1
        return
      if tag == 'ul' and self.scrapul == True:
        self.scrapul = False
        self.skip -= 1
        return
      
      # end of 'clear' div
      if tag == 'div' and self.scrapclear == True:
        self.scrapclear = False
        self.skip -= 1
        return
      
      if self.skip > 0: return
      
      # end of paragraph
      if tag == 'p':
        if not self.inlist and not self.paragraphed:
          if not self.out.endswith('\n'): self.out += '\n'
          self.out += '\n'
          self.paragraphed = True
        return
      
      # create a link
      if tag == 'a':
        if self.linkref:
          self.lastdata = self.lastdata.strip()
          # if there is no preceding space, add one; exceptions:
          # - no output yet
          # - output ends with newline or = (heading)
          # - output ends with "'" and we are in an italics or bold section
          # - output ends with a right arrow (index entry) link
          if len(self.out) > 0 and not (self.out[-1] in ' \n=("' or (self.out.endswith("'") and (self.italicsdepth > 0 or self.bolddepth > 0)) or self.out.endswith(u'→]]')):
            self.out += ' '
          
          if 'articleId' in self.linkref:	# articles
            id = self.getid('articleId')
            if self.emptytag: self.lastdata = u'→'	# no link text -> index entry link (right arrow)

            # links to articles that start with "BRITANNICA BOOK OF THE YEAR <year>" seem
            # to be the only way to find out what Book of the Year an article belongs to,
            # so we use them to rename BOTY articles
            if self.lastdata.startswith('BRITANNICA BOOK OF THE YEAR '):
              print 'BOOKOFTHEYEARLINK for id',id
              try: print 'ARTICLETITLE ' + articletitle[id].encode('utf-8')
              except: pass
              if articletitle.has_key(id) and not articletitle[id].endswith(' (' + self.lastdata[-4:] + ')'):
                print 'RENAMING',articletitle[id].encode('utf-8')
                alltitles.remove(articletitle[id])
                articletitle[id] = re.sub(' \([0-9]*\)$','', articletitle[id])
                articletitle[id] = articletitle[id] + ' (' + self.lastdata[-4:] + ')'
                sufcount = 1
                sufname = articletitle[id]
                while sufname in alltitles:
                  sufname = suffix(articletitle[id], sufcount)
                articletitle[id] = sufname
                alltitles.add(articletitle[id])
                print 'TO',articletitle[id].encode('utf-8')

            try: self.out += '[[' + articletitle[id] + '|' + self.lastdata + ']]'
            except KeyError: self.out += '[[article ' + str(id) + '|' + self.lastdata + ']]'

          elif 'idxStructId' in self.linkref:	# index entries
            id = self.getid('idxStructId')
            if self.emptytag: self.lastdata = u'→'	# no link text -> index entry link (right arrow)

            # this is a link to an index or year in review article
            # strangely enough, once in a while an ID is not in the database specified,
            # so we try both in either case
            if 'IndexEntryContentIdentifier' in self.linkref:
              try: title = indextitle[id]
              except KeyError:
                try: title = yeartitle[id]
                except KeyError: title = 'index ' + str(id)
            else:
              try: title = yeartitle[id]
              except KeyError:
                try: title = indextitle[id]
                except KeyError: title = 'yearbook ' + str(id)
            
            self.out += '[[' + title + '|' + self.lastdata + ']]'

          else:	# nothing we can deal with
            self.out += '[[' + self.linkref + '|' + self.lastdata + ']]'
          
          self.state = 'dump'	# go back to writing data out immediately
          self.linkref = None
        return
      
      if tag == 'ul' or tag == 'ol':
        self.inlist = False
        return
        
      if tag == 'li':
        self.out += '\n'
        return
      
      if tag == 'i':
        self.italicsdepth -= 1
        if self.italicsdepth == 0:
          #self.out = self.out.rstrip() + "''"
          if self.out.rstrip() != self.out:
            self.out = self.out.rstrip() + ' '
          if self.lastdata.rstrip() != self.lastdata:
            self.lastdata = self.lastdata.rstrip() + ' '
          
          self.handle_data("''")
          self.lastquote = 2
          self.paragraphed = False
        return
      
      if tag == 'b':
        self.bolddepth -= 1
        if self.bolddepth == 0:
          #self.out = self.out.rstrip() + "'''"
          if self.out.rstrip() != self.out:
            self.out = self.out.rstrip() + ' '
          if self.lastdata.rstrip() != self.lastdata:
            self.lastdata = self.lastdata.rstrip() + ' '
          self.handle_data("'''")
          self.lastquote = 3
          self.paragraphed = False
        return

      # if this is a span that we handled somehow, print the code to end it
      if tag == 'span' and self.spanoff:
        if self.spanoff == "''":
          self.italicsdepth -= 1
          if self.italicsdepth == 0:
            self.out = self.out.rstrip() + self.spanoff
            self.lastquote = 2
            self.paragraphed = False
        return
      
      if tag[0] == 'h':
        try:
          depth = int(tag[1])
          if self.out.endswith('='): # empty heading
            self.out += ' '
          self.out += '=' * depth + '\n'
          self.paragraphed = True
          return
        except ValueError: pass
      
      if tag == 'img':
        # nothing to do
        return
      
      if tag == 'font':
        if self.infont > 0:
          self.out += '</font>'
          self.infont -= 1
        return
      
      if tag == 'blockquote':
        self.out += '</blockquote>\n'	# needs an extra newline
        return
      
      # end of div -> end of paragraph
      if tag == 'div':
        # unclosed <a> tags occasionally eat parts of articles
        # let's assume links do not cross div boundaries
        self.state = 'dump'
        if not self.paragraphed:
          self.out += '\n\n'
          self.paragraphed = True
        return
      
      if tag in ['br', 'hr']:
        # nothing to do
        return
        
      if tag in self.passthrough:
        self.out += '</' + tag + '>'
        return
      
      if tag == 'td':
        if self.intd:
          if self.infont > 0:
            self.out += '</font>' * self.infont
            self.infont = 0
          self.out += '</td>'
          self.intd = False
        return
        
      if not self.skip:
        if not tag in self.dontreport:
          print 'END ',tag
        
    # should not be necessary in the final version
    def handle_charref(self, name):
      #breakage
      if name == '146':
        self.handle_data(u'’')
        return
      print 'CHRR', name
      self.handle_data('&#' + name + ';')
    
    # this is necessary even though we handle HTML entities before useing the HTML parser
    # because said parser assumes everything that starts with & to be an HTML entity, even
    # if there is no semicolon at the end
    def handle_entityref(self, name):
      freakystuff = {'dollar': '$', 'lsqb': '[', 'rsqb': ']', 'plus': '+'}
      try:
        self.handle_data(freakystuff[name])
        return
      except KeyError: pass

      # resolving lt and gt may confuse the HTML parser, which
      # is why they are kept
      if name == 'lt' or name == 'gt':
        self.handle_data('&' + name + ';')
      else:
        print 'ENTI', name
        self.handle_data('&' + name)
    
    # plain text handling
    def handle_data(self, data):
      self.emptytag = False
      self.lastdata += data	# FIXME: doesn't this have to be done in non-dump states only?
      if self.skip: return
      if self.state == 'dump':
        # scrap all spaces after newlines because leading spaces have a special meaning in
        # mediawiki markup
        if self.out.endswith('\n'): data = data.lstrip(' ')
        while '\n ' in data: data = data.replace('\n ', '\n')
        
        self.out += data.strip('\n\r')
        for i in data:
          if i.isalpha():
            self.paragraphed = False
            break
      else:
        #print 'DATA', data
        pass
  
  # weed out stupid HTML errors
  stupiderrors = [('<TD <a', '<td><a'),
    ('</TD\n', '</td>\n'),
    ('<td colspan="4"</td', '<td colspan="4"></td'),
    ('</ul</td>', '</ul></td>'),
    ('<td scope"', '<td scope="'),
    ('<LINK\n', '<link>\n'),
    ('<TD colspan="3"</TD', '<td colspan="3"></td'),
    ('<td colspan"', '<td colspan="'),
    ('class="oec_center"\n', 'class="oec_center">\n'),
    ('</a</LI>', '</a></li>'),
  ]
  for i in stupiderrors:
    if i[0] in article:
      print 'WARN','HTML error caught:',i[0]
      article = article.replace(i[0], i[1])
  
  b = BritParser()
  try: b.feed(article)
  except HTMLParser.HTMLParseError, e:
    print 'ERROR', 'failed to parse article:', e
    b.out = "''This article failed to parse: " + str(e) + "''\n\n''Using original HTML text.''\n\n" + article
  
  if postproc:
    if mytitle.startswith('BSL:'):
      studentlib = True
      myt = mytitle[4:]
    else:
      studentlib = False
    # autowikification
    for a,aa in at_bysize:
      #print 'xxx','a',a.encode('utf-8'),'aa',aa.encode('utf-8')
      #aa = a	# the actual text to look for
      # Britannica articles on "A B" are frequently titled "B, A".
      # FIXME: currently excludes titles with suffixes ("(2001)") and namespaces
      #if (not ':' in a) and (not a.endswith(')')) and a.count(', ') == 1:
      #  p = a.index(', ')
      #  aa = a[p+2:] + ' ' + a[:p]
      
      if studentlib:
        if a == myt:	# link to EB article of same name -> we want that in "see also"
          continue
        if aa.startswith('BSL:'):	# link to BSL article if possible
          aa = aa[4:]
          if len(aa) < 5: continue
      
      if aa in b.out:

        x = b.out.index(aa)	# position of article name in the text

        # don't link if immediately preceded by another alphanumeric character
        if x > 0 and b.out[x-1].isalpha():
          continue
        
        # don't link if more than 3 alpha characters follow
        # (fixes cases such as "discovered" -> "disco", "countries" -> "count")
        if b.out[x+len(aa):][:3+1].isalpha(): continue
        # short links that have not been discarded are acronyms -> zero tolerance
        if len(aa) < 5 and b.out[x+len(aa)].isalpha(): continue
        
        if x > 0 and b.out[x-1] == '<':	# "<table>"...
          continue
        
        dont = False
        # check if we are inside a link; if so, don't link
        while x > 0 and b.out[x] != ']' and not dont:
          if b.out[x] == '[':
            dont = True
          x -= 1
        if dont: continue
        
        # check if this page is already linked to
        # (special exception for BSL pages: blocking them causes the corresponding EB article to be
        # linked instead, which is most often undesirable)
        if ('[[' + a + '|' in b.out or '[[' + a + ']]' in b.out) and not a.startswith('BSL:'):
          continue
        
        # create a link unless there is a reason not to
        #if a != aa: print 'a',a,'aa',aa
        if aa != a: b.out = b.out.replace(aa, '[[' + a + '|' + aa + ']]' , 1)
        else: b.out = b.out.replace(a, '[[' + a + ']]', 1)
    
    b.out = b.out.strip()
    if b.out.startswith('==Introduction==\n'): b.out = b.out[17:].strip()
    
    # see also...
    if not (mytitle.startswith('IndexEntry:') or mytitle.startswith('YearInReview:')):
      seealso = False
      mybaretitle = mytitle
      if ':' in mybaretitle: mybaretitle = mybaretitle[mybaretitle.index(':')+1:]
      while re.search(' \([0-9]*\)$', mybaretitle): mybaretitle = re.sub(' \([0-9]*\)$','',mybaretitle)
      av = articletitle.values()
      av.sort()
      if len(mybaretitle) > 4:
        for a in av:
          if a != mytitle and mybaretitle in a and (not '[[' + a + '|' in b.out) and (not '[[' + a + ']]' in b.out):
            if not seealso:
              b.out += '\n\n==See also==\n'
              seealso = True
            b.out += '* [[' + a
            if a.startswith('BSL:'):
              b.out += '|' + a[4:] + "]] <small>(Britannica Student Library)</small>\n"
            elif a.startswith('BookOfTheYear:'):
              b.out += '|' + a[14:] + "]] <small>(Britannica Book of the Year)</small>\n"
            else:
              b.out += "]] <small>(Encyclopaedia Britannica)</small>\n"
      else:
        for a in av:
          if a == mytitle: continue
          abare = a
          if not mybaretitle in abare: continue
          if ':' in abare:
            abare = abare[abare.index(':')+1:]
          while re.search(' \([0-9]*\)$', abare): abare = re.sub(' \([0-9]*\)$','',abare)
          if not (abare == mybaretitle or \
             ' '+mybaretitle+' ' in abare or \
             abare.startswith(mybaretitle+' ') or \
             abare.endswith(' '+mybaretitle)): continue
          if not seealso:
            b.out += '\n\n==See also==\n'
            seealso = True
          b.out += '* [[' + a
          if a.startswith('BSL:'):
            b.out += '|' + a[4:] + "]] <small>(Britannica Student Library)</small>\n"
          elif a.startswith('BookOfTheYear:'):
            b.out += '|' + a[14:] + "]] <small>(Britannica Book of the Year)</small>\n"
          else:
            b.out += "]] <small>(Encyclopaedia Britannica)</small>\n"
  return b.out
  

# Open the databases. Each one consists of three files: an index (.pidx),
# the records the index points to (.data) and the payload the records point
# to (lobs/CACHE_VALUE).
# All of them hold binary data (struct-packed records, gzip streams), so
# they are opened in binary mode to keep the bytes intact on every platform.
pth = ebdir + '/data/ARTICLE_DATA/articledb/'

# Encyclopaedia Britannica articles
pidx = open(pth + 'EB/EB.pidx','rb')
data = open(pth + 'EB/EB.data','rb')
cache = open(pth + 'EB/lobs/CACHE_VALUE', 'rb')

# EBI database (NOTE(review): presumably the Student Library articles - confirm)
stpidx = open(pth + 'EBI/EBI.pidx','rb')
stdata = open(pth + 'EBI/EBI.data','rb')
stcache = open(pth + 'EBI/lobs/CACHE_VALUE', 'rb')

# index entries
ipidx = open(pth + 'TOPIC_MAP_INDEXENTRY/TOPIC_MAP_INDEXENTRY.pidx', 'rb')
idata = open(pth + 'TOPIC_MAP_INDEXENTRY/TOPIC_MAP_INDEXENTRY.data', 'rb')
icache = open(pth + 'TOPIC_MAP_INDEXENTRY/lobs/CACHE_VALUE', 'rb')

# yearbooks
ypidx = open(pth + 'TOPIC_MAP_YEARBOOKS/TOPIC_MAP_YEARBOOKS.pidx', 'rb')
ydata = open(pth + 'TOPIC_MAP_YEARBOOKS/TOPIC_MAP_YEARBOOKS.data', 'rb')
ycache = open(pth + 'TOPIC_MAP_YEARBOOKS/lobs/CACHE_VALUE', 'rb')

# images
mpidx = open(ebdir + '/data/IMAGE_DATA/imagedb/IMAGE/IMAGE.pidx','rb')
mdata = open(ebdir + '/data/IMAGE_DATA/imagedb/IMAGE/IMAGE.data','rb')
mcache = open(ebdir + '/data/IMAGE_DATA/imagedb/IMAGE/lobs/CACHE_VALUE', 'rb')

# Find out how wide the index entries are: if the first eight bytes of the
# EB index are all zero, entries are 64-bit big-endian integers, otherwise
# 32-bit ones. The result is used by getentry() for every index file.
# NOTE(review): the eight probe bytes are consumed from pidx and not
# rewound, so reading entries from pidx starts at offset 8.
if struct.unpack('>Q', pidx.read(8)) == (0,):
  idxformat = '>Q'
else:
  idxformat = '>I'
idxlen = struct.calcsize(idxformat)

count = 0

def getentry(idx, dat):
  """Read the next entry of an index file and return the record it points to.

  idx -- open index file; one entry (idxlen bytes, format idxformat) is
         consumed from its current position
  dat -- open data file; it is positioned at the offset found in the entry

  Returns the three integers of the 17-byte big-endian record ('>IxQI').
  Raises struct.error when either file does not supply enough bytes, e.g.
  at the end of the index.
  """
  raw = idx.read(idxlen)
  (position,) = struct.unpack(idxformat, raw)
  dat.seek(position)
  record = dat.read(17)
  return struct.unpack('>IxQI', record)

def suffix(title, count):
  """Return title followed by ' (count)', e.g. suffix('Foo', 2) -> 'Foo (2)'."""
  return title + ' (%s)' % (count,)

def getarticle(cach, offset, length, id, hash, lhash, prefix = ''):
  """Read one compressed article and make sure it has a unique title.

  cach   -- open file holding the gzip-compressed article
  offset -- position of the article in cach
  length -- number of compressed bytes to read
  id     -- id of the article; key into hash and lhash
  hash   -- dict id -> title; gets an entry for id unless one exists already
  lhash  -- dict id -> compressed length; filled together with hash
  prefix -- text put in front of a newly derived title

  A new title is taken from the article's <title> element. If it is taken
  already, either this article or the one registered in hash under the same
  title is renamed with a ' (n)' suffix; the bare title goes to the one with
  the larger compressed length. Every title handed out is recorded in the
  module-level alltitles.

  Returns (title, article), both with the named HTML entities (except
  &lt; and &gt;) replaced by the characters they stand for.
  """
  # replace named entities by their characters
  def resolve(text):
    for name, codepoint in htmlentitydefs.name2codepoint.items():
      # < and > may confuse the HTML parser, so we keep these entities
      if name == 'lt' or name == 'gt': continue
      text = text.replace('&' + name + ';', unichr(codepoint))
    return text

  # first "base (n)" with n >= 2 that is not taken yet
  def freename(base):
    n = 2
    while suffix(base, n) in alltitles:
      n += 1
    return suffix(base, n)

  cach.seek(offset)
  article = unicode(unpack(cach.read(length)), 'utf-8')

  if id not in hash:
    title = resolve(re.search('<title>(.*?)</title>', article, re.DOTALL).group(1))

    # move the kind of page into a namespace-like prefix
    if 'Britannica Book of the Year Article' in article:
      title = "BookOfTheYear:" + title
    title = title.replace('Index Entry: ', 'IndexEntry:')
    if title.startswith('Year in Review: '): title = title.replace('Year in Review: ', 'YearInReview:')
    title = title.replace('Document: ', 'Document:')

    # replace line breaks with spaces
    title = re.compile('\s+',re.S).sub(' ', prefix + title).strip()

    if title not in alltitles:
      hash[id] = title
      alltitles.add(title)
    else:
      for k,v in hash.items():
        if v != title: continue
        if lhash[k] < length:
          # rename the existing article
          hash[k] = freename(v)
          hash[id] = title
          alltitles.add(hash[k])
        else:
          # rename this article
          hash[id] = freename(title)
          alltitles.add(hash[id])
        break
      else:
        # the title is taken by something that is not in hash
        hash[id] = freename(title)
        alltitles.add(hash[id])

    lhash[id] = length

  return (hash[id], resolve(article))

def getbinary(cach, offset, length):
  """Return the length bytes stored at position offset of the open file cach."""
  cach.seek(offset)
  return cach.read(length)

def addarticle(title, article, debug_type, debug_offset, debug_length, debug_idxoff):
  article = cleanandlink(article, title)
  article += "\n\n''" + debug_type + ' offset ' + str(debug_offset) + ' length ' + str(length) + ' idxoff ' + str(debug_idxoff) + "''\n"
  
  #print article.encode('utf-8')

  if False: # use api.php via HTTP
    p = urllib.urlopen('http://localhost/eb/api.php?action=query&prop=info&titles=' + urllib.quote(title.encode('utf-8')) + '&intoken=edit&format=xml')
    pp = p.read()
    #print 'pp',pp

    try: tok = re.search('edittoken="(.*?)"', pp).groups()[0]
    except AttributeError:
      print 'WARNING','could not get edit token'
      return

    try: stamp = re.search('touched="(.*?)"', pp).groups()[0]
    except: stamp = ''
    #print 'token',tok
    
    import httplib
    params = urllib.urlencode({'epedittoken': tok,
                               'epsummary': 'blubb',
                               'eptext': article.encode('utf-8'),
                               'disablemerge': 'yes',
                               'epedittime': stamp,
                              })
    headers = {"Content-type": "application/x-www-form-urlencoded",
               "Accept": "text/plain"}
    conn = httplib.HTTPConnection("localhost:80")
    conn.request("POST", "/eb/api.php?action=edit&format=xml&eptitle=" + urllib.quote(title.encode('utf-8')), params, headers)
    response = conn.getresponse()
    print response.status, response.reason
    resp = response.read()
    conn.close()
  else: # use edit.php locally
    p = os.popen('php ./maintenance/edit.php "' + title.replace('"','\\"').encode('utf-8') + '"', 'w')
    try:
      p.write(article.encode('utf-8'))
      p.close()
    except IOError,e:
      print 'ERROR','I/O error:',e


try:
  f = open('imgtype.pickle','r')
  imgtype = pickle.load(f)
except:
  while True:
    try: id, offset, length = getentry(mpidx, mdata)
    except: break
    print 'id', id, 'offset', offset, 'length', length
    if length == 0: continue
    image = getbinary(mcache, offset, length)
    if image[6:10] == 'JFIF': imgtype[id] = 'jpg'
    elif image[0:3] == 'GIF': imgtype[id] = 'gif'
    else: shit()
    open('./eimages/' + str(id) + '.' + imgtype[id], 'w').write(image)
    # FIXME: import to Mediawiki (maintenance/importImages.php)
  f = open('imgtype.pickle', 'w')
  pickle.dump(imgtype, f)
  f.close()

for _pass in [1,2]:
  try:
    print 'pass', _pass
    if _pass == 1:
      try:
        f = open('titles.pickle', 'r')
        articletitle = pickle.load(f)
        indextitle = pickle.load(f)
        yeartitle = pickle.load(f)
        alltitles = pickle.load(f)
        # no need to save the length dicts, they are only used in pass 1
        continue
      except: pass
      pidx.seek(0)
      ipidx.seek(0)
      ypidx.seek(0)
      stpidx.seek(0)
    else:
      # create a list of article titles sorted by size (descending)
      # used to autowikify articles
      at_bysize = articletitle.values()
      def c(x,y):
        if len(x) == len(y): return 0
        elif len(x) > len(y): return -1
        else: return 1
      at_bysize.sort(c)
      x = list()
      for i in at_bysize:
        if (len(i) > 4 or (i.isalpha() and i.upper() == i and len(i) > 2)) \
           and not (i in wiki_blacklist or (i.startswith('BSL:') and i[4:] in wiki_blacklist)):
          x += [i]
      at_bysize = x
      
      x = list()
      for a in at_bysize:
        aa = a
        # Britannica articles on "A B" are frequently titled "B, A".
        # FIXME: currently excludes titles with suffixes ("(2001)")
        if (not ':' in a) and (not a.endswith(')')) and a.count(', ') == 1:
          p = a.index(', ')
          aa = a[p+2:] + ' ' + a[:p]
        elif (a.startswith('BookOfTheYear:') or a.startswith('BSL:')) and (not a.endswith(')')) and a.count(', ') == 1:
          pref, nopref = a.split(':', 1)
          p = nopref.index(', ')
          aa = pref + ':' + nopref[p+2:] + ' ' + nopref[:p]
          
        x += [(a, aa)]
      at_bysize = x
      
      try:
        f = open('index.pickle', 'r')
        pidx.seek(pickle.load(f))
        ipidx.seek(pickle.load(f))
        ypidx.seek(pickle.load(f))
      except:
        pidx.seek(0)
        ipidx.seek(0)
        ypidx.seek(0)
      try:
        stpidx.seek(pickle.load(f))
      except:
        stpidx.seek(0)

    count = 0
    while True:
      count += 1
      #if _pass == 2 and count > 10: break
      #if count > 100: break
      try: id, offset, length = getentry(ipidx, idata)
      except: break
      print 'id', id, 'offset', offset, 'length', length, 'idxoff', ipidx.tell()
      title, article = getarticle(icache, offset, length, id, indextitle, indexl)
      print 'title',title.encode('utf-8')
      if _pass == 2:
        addarticle(title, article, 'index', offset, length, ipidx.tell())
        #print article.encode('utf-8')

    count = 0
    while True:
      #if _pass == 2 and count > 20: break
      #if _pass == 1 and count > 100: break
      try: id, offset, length = getentry(stpidx, stdata)
      except: break
      print 'id',id,'offset',offset,'length',length,'idxoff',stpidx.tell()
      
      title, article = getarticle(stcache, offset, length, id, articletitle, articlel, 'BSL:')

      print 'title',title.encode('utf-8')
      #if _pass == 2 and title.startswith('BookOfTheYear') and count > 4: continue
      count += 1

      if _pass == 2:
        addarticle(title, article, 'article', offset, length, stpidx.tell())
        #break

    count = 0
    while True:
      #if _pass == 2 and count > 20: break
      #if _pass == 1 and count > 100: break
      try: id, offset, length = getentry(pidx, data)
      except: break
      print 'id',id,'offset',offset,'length',length,'idxoff',pidx.tell()
      
      title, article = getarticle(cache, offset, length, id, articletitle, articlel)

      print 'title',title.encode('utf-8')
      #if _pass == 2 and title.startswith('BookOfTheYear') and count > 4: continue
      count += 1

      if _pass == 2:
        addarticle(title, article, 'article', offset, length, pidx.tell())
        #break

    count = 0
    while True:
      count += 1
      #if _pass == 2 and count > 10: break
      #if count > 100: break
      try: id, offset, length = getentry(ypidx, ydata)
      except: break
      print 'id', id, 'offset', offset, 'length', length, 'idxoff', ypidx.tell()
      title, article = getarticle(ycache, offset, length, id, yeartitle, yearl)
      print 'title',title.encode('utf-8')
      if _pass == 2:
        addarticle(title, article, 'yearinreview', offset, length, ypidx.tell())
        #print article.encode('utf-8')
      else:
        cleanandlink(article, title, postproc = False)	# side effect: renames BOTY articles

    if _pass == 1:
      f = open('titles.pickle', 'w')
      pickle.dump(articletitle, f)
      pickle.dump(indextitle, f)
      pickle.dump(yeartitle, f)
      pickle.dump(alltitles, f)
      f.close()

  except KeyboardInterrupt:
    if _pass == 2:
      f = open('index.pickle', 'w')
      pickle.dump(max(0, pidx.tell() - 8), f)
      pickle.dump(max(0, ipidx.tell() - 8), f)
      pickle.dump(max(0, ypidx.tell() - 8), f)
      pickle.dump(max(0, stpidx.tell() - 8), f)
      f.close()
    print 'aborted, restart to resume'
    sys.exit(1)