# Author: Skip Montanaro (skip@mojam.com) # Version: 0.1 """Read SYLK files Caveat emptor! This module has only been tested with SYLK files generated by AppleWorks 5.0! It almost certainly needs work to be able to process files generated by other spreadsheets. """ import sys import re import string import time import types class Array: """simple array representation of a spreadsheet grid""" def __init__(self): self.rows = [] def __setitem__(self, (x,y), val): ox = x - 1 oy = y - 1 while len(self.rows) < y: self.rows.append([]) for row in self.rows: while len(row) < x: row.append(" ") self.rows[oy][ox] = val def writecsv(self, stream): for row in self.rows: stream.write('"') stream.write(string.join(row, '","')) stream.write('"\n') stream.flush() class SYLK: """class to read SYLK files and dump to CSV""" # when time began # different computers use different base dates and store dates as offsets # this makes SYLK inherently unportable, but we fudge that by # using the ID field to guess at the creating platform # note that PCs apparently can't properly decode SYLK files generated # on Macs using Appleworks/Clarisworks because they don't take this into # account unixepoch = (1970,1,1,0,0,0,0,0,0) macepoch = (1904,1,1,0,0,0,0,0,0) # this is pure fiction... pcepoch = (1900,1,1,0,0,0,0,0,0) # map SYLK format strings into data types knownformats = { 'General': 'string', '0': 'int', '0.00': 'float', '#,##0': 'int', '#,##0.00': 'float', '"$"#,##0\ ;;\("$"#,##0\,': 'float', '"$"#,##0.00\ ;;\("$"#,##0.00\,': 'float', '0%': 'float', '0.00%': 'float', '0.00E+00': 'float', 'm/d/yy': 'date', 'd-mmm-yy': 'date', 'd-mmm': 'date', 'mmm-yy': 'date', 'h:mm AM/PM': 'time', 'h:mm:ss AM/PM': 'time', 'h:mm': 'time', 'h:mm:ss': 'time', 'hh:mm AM/PM': 'time', 'hh:mm:ss AM/PM': 'time', 'h:mm': 'time', 'h:mm:ss': 'time', 'm/d/yy h:mm': 'datetime', 'm-dd-yy': 'date', 'm-dd': 'date', '"$"#,##0 ;;[Red]("$"#,##0,': 'float', '"$"#,##0.00 ;;[Red]("$"#,##0.00,': 'float', 'mmm d, yyyy': 'date', 'mmmm d, yyyy': 'date', 'ddd, mmm d, yyyy': 'date', 'dddd, mmmm d, yyyy': 'date', 'd, mmmm yyyy': 'date', } def __init__(self): self.datebase = self.unixepoch self.printformats = [] self.currentformat = self.currenttype = "" self.curx = self.cury = 0 self.data = Array() self.unknown = {} def escape(self,s): if s[0:1] == '"': return '"' + re.sub('"', '\\"\\"', s[1:-1]) + '"' return s def parse(self,stream): lines = string.split(re.sub("[\r\n]+", "\n", stream.read()), "\n") for line in lines: self.parseline(line) def writecsv(self, stream): self.data.writecsv(stream) def addunknown(self,fld,subfld): self.unknown[fld] = self.unknown.get(fld, {}) self.unknown[fld][subfld] = 1 def writeunknown(self,stream): if self.unknown: stream.write("Unrecognized fields (subfields):\n") for key in self.unknown.keys(): stream.write("%s (%s)\n" % (key, `self.unknown[key].keys()`)) else: stream.write("No unrecognized fields\n") stream.flush() def parseline(self,line): fields = re.split("(?i);(?=[a-z])", line) if fields[0] == "ID": if fields[1][:6] in ("PClari", "PApple"): self.datebase = self.macepoch if fields[0] == "F": for f in fields[1:]: ftd = f[0] val = f[1:] if ftd == "X": self.curx = int(val) elif ftd == "Y": self.cury = int(val) elif ftd == "P": # references print format for the next cell self.currentformat, self.currenttype = \ self.printformats[int(val)] else: self.addunknown("F",ftd) elif fields[0] == "C": for f in fields[1:]: ftd = f[0] val = f[1:] if ftd == "X": self.curx = int(val) elif ftd == "Y": self.cury = int(val) elif ftd == "K": val = eval(self.escape(val)) if type(val) == types.IntType: if self.currenttype == "date": # value is offset in days from datebase date = time.localtime(time.mktime(self.datebase)+ float(val)*24*60*60) val = time.strftime("%m/%d/%y", date) self.data[(self.curx,self.cury)] = "%s"%val else: self.addunknown("C",ftd) elif fields[0] == "P": # print formats imply data types? if fields[1][0] == "P": format = string.replace(fields[1][1:], "\\", "") if self.knownformats.has_key(format): self.printformats.append((format, self.knownformats[format])) else: # hack to guess type... hasY = "y" in format hasD = "d" in format hasH = "h" in format hasZ = "0" in format hasP = "." in format if (hasD or hasY) and hasH: dtype = "datetime" elif hasD or hasY: dtype = "date" elif hasH: dtype = "time" elif hasP and hasZ: dtype = "float" elif hasZ: dtype = "int" else: dtype = "string" self.printformats.append((format, dtype)) else: self.addunknown("P",fields[1][0]) else: fld = fields[0] for f in fields[1:]: ftd = f[0] self.addunknown(fld,f)