
# Author: Skip Montanaro (skip@mojam.com)
# Version: 0.1

"""Read SYLK files

Caveat emptor!  This module has only been tested with SYLK files generated
by AppleWorks 5.0!  It almost certainly needs work to be able to process
files generated by other spreadsheets.
"""

import sys
import re
import string
import time
import types

class Array:
    """simple array representation of a spreadsheet grid"""
    def __init__(self):
        self.rows = []
    def __setitem__(self, (x,y), val):
        ox = x - 1
        oy = y - 1
        while len(self.rows) < y:
            self.rows.append([])
        for row in self.rows:
            while len(row) < x:
                row.append(" ")
        self.rows[oy][ox] = val
    def writecsv(self, stream):
        for row in self.rows:
            stream.write('"')
            stream.write(string.join(row, '","'))
            stream.write('"\n')
        stream.flush()
        
class SYLK:
    """class to read SYLK files and dump to CSV"""

    # when time began
    # different computers use different base dates and store dates as offsets
    # this makes SYLK inherently unportable, but we fudge that by
    # using the ID field to guess at the creating platform
    # note that PCs apparently can't properly decode SYLK files generated
    # on Macs using Appleworks/Clarisworks because they don't take this into
    # account
    unixepoch = (1970,1,1,0,0,0,0,0,0)
    macepoch = (1904,1,1,0,0,0,0,0,0)
    # this is pure fiction...
    pcepoch = (1900,1,1,0,0,0,0,0,0)

    # map SYLK format strings into data types
    knownformats = {
        'General': 'string',
        '0': 'int',
        '0.00': 'float',
        '#,##0': 'int',
        '#,##0.00': 'float',
        '"$"#,##0\ ;;\("$"#,##0\,': 'float',
        '"$"#,##0.00\ ;;\("$"#,##0.00\,': 'float',
        '0%': 'float',
        '0.00%': 'float',
        '0.00E+00': 'float',
        'm/d/yy': 'date',
        'd-mmm-yy': 'date',
        'd-mmm': 'date',
        'mmm-yy': 'date',
        'h:mm AM/PM': 'time',
        'h:mm:ss AM/PM': 'time',
        'h:mm': 'time',
        'h:mm:ss': 'time',
        'hh:mm AM/PM': 'time',
        'hh:mm:ss AM/PM': 'time',
        'h:mm': 'time',
        'h:mm:ss': 'time',
        'm/d/yy h:mm': 'datetime',
        'm-dd-yy': 'date',
        'm-dd': 'date',
        '"$"#,##0 ;;[Red]("$"#,##0,': 'float',
        '"$"#,##0.00 ;;[Red]("$"#,##0.00,': 'float',
        'mmm d, yyyy': 'date',
        'mmmm d, yyyy': 'date',
        'ddd, mmm d, yyyy': 'date',
        'dddd, mmmm d, yyyy': 'date',
        'd, mmmm yyyy': 'date',
    }

    def __init__(self):
        self.datebase = self.unixepoch
        self.printformats = []
        self.currentformat = self.currenttype = ""
        self.curx = self.cury = 0
        self.data = Array()
        self.unknown = {}
        
    def escape(self,s):
        if s[0:1] == '"':
            return '"' + re.sub('"', '\\"\\"', s[1:-1]) + '"'
        return s

    def parse(self,stream):
        lines = string.split(re.sub("[\r\n]+", "\n", stream.read()), "\n")
        for line in lines:
            self.parseline(line)

    def writecsv(self, stream):
        self.data.writecsv(stream)
        
    def addunknown(self,fld,subfld):
        self.unknown[fld] = self.unknown.get(fld, {})
        self.unknown[fld][subfld] = 1

    def writeunknown(self,stream):
        if self.unknown:
            stream.write("Unrecognized fields (subfields):\n")
            for key in self.unknown.keys():
                stream.write("%s (%s)\n" % (key, `self.unknown[key].keys()`))
        else:
            stream.write("No unrecognized fields\n")
        stream.flush()
        
    def parseline(self,line):
        fields = re.split("(?i);(?=[a-z])", line)
        if fields[0] == "ID":
            if fields[1][:6] in ("PClari", "PApple"):
                self.datebase = self.macepoch
        if fields[0] == "F":
            for f in fields[1:]:
                ftd = f[0]
                val = f[1:]
                if ftd == "X":
                    self.curx = int(val)
                elif ftd == "Y":
                    self.cury = int(val)
                elif ftd == "P":
                    # references print format for the next cell
                    self.currentformat, self.currenttype = \
                                        self.printformats[int(val)]
                else:
                    self.addunknown("F",ftd)
        elif fields[0] == "C":
            for f in fields[1:]:
                ftd = f[0]
                val = f[1:]
                if ftd == "X":
                    self.curx = int(val)
                elif ftd == "Y":
                    self.cury = int(val)
                elif ftd == "K":
                    val = eval(self.escape(val))
                    if type(val) == types.IntType:
                        if self.currenttype == "date":
                            # value is offset in days from datebase
                            date = time.localtime(time.mktime(self.datebase)+
                                                  float(val)*24*60*60)
                            val = time.strftime("%m/%d/%y", date)
                    self.data[(self.curx,self.cury)] = "%s"%val
                else:
                    self.addunknown("C",ftd)
        elif fields[0] == "P":
            # print formats imply data types?
            if fields[1][0] == "P":
                format = string.replace(fields[1][1:], "\\", "")
                if self.knownformats.has_key(format):
                    self.printformats.append((format,
                                              self.knownformats[format]))
                else:
                    # hack to guess type...
                    hasY = "y" in format
                    hasD = "d" in format
                    hasH = "h" in format
                    hasZ = "0" in format
                    hasP = "." in format
                    if (hasD or hasY) and hasH:
                        dtype = "datetime"
                    elif hasD or hasY:
                        dtype = "date"
                    elif hasH:
                        dtype = "time"
                    elif hasP and hasZ:
                        dtype = "float"
                    elif hasZ:
                        dtype = "int"
                    else:
                        dtype = "string"
                    self.printformats.append((format, dtype))
            else:
                self.addunknown("P",fields[1][0])
        else:
            fld = fields[0]
            for f in fields[1:]:
                ftd = f[0]
                self.addunknown(fld,f)
