''' rsonlite -- an extremely lightweight version of rson. Copyright (c) 2012, Patrick Maupin License :: MIT http://pypi.python.org/pypi/rsonlite http://code.google.com/p/rson/ rsonlite makes it easy to build a file parser for declarative hierarchical data structures using indentation. (Spaces only, tabs not considered indentation.) The only special characters are '#', '=', and indentation: - Indentation denotes a key/value relationship. The value is indented from the key. - = Denotes the start of a free-format string. These strings can contain '=' and '#' characters, and even be multi-line, but every line in the string must be indented past the initial equal sign. Note that, for multi-line strings, indentation is preserved but normalized such that at least one line starts in the left column. This allows for restructuredText or Python code to exist inside multi-line strings. - # Denotes the start of a line comment, when not inside a free-format string. The only Python objects resulting from parsing a file with rsonlite are: - strings: free-format strings (described above) can contain any character, but the whitespace before/after the string may be stripped. Regular strings must fit on a single line and cannot contain '=' or '#' characters. Regular strings may be used as keys in key/value pairs, but free-format strings may not. - tuple: A key/value pair is a two-element tuple. The key is always a string. The value is always a list. - list: The top level is a list, and the value element of every key/value pair tuple is also a list. Lists can contain strings and key/value pair tuples. ''' import re version = __version__ = '0.1.0' # Our attempt at rationalizing differences between Python 2 and Python 3. try: basestring except NameError: basestring = str class unicode: pass # Use OrderedDict if it's available try: from collections import OrderedDict as stddict except ImportError: stddict = dict # Splits the entire file into probable tokens. splitter = re.compile('(\n *|=[^\n]*|#[^\n]*|[^\n#=]+)').findall class RsonToken(str): ''' A string that may be annotated with location information ''' def __new__(cls, s, line, col): self = str.__new__(cls, s) self.line = line self.col = col return self def __add__(self, other): return RsonToken(str(self) + other, self.line, self.col) def gettoks(source): ''' Convert string into (probable) tokens (some tokens may be recombined later, e.g. if they contain # or = but were already inside a string) ''' # Use "regular" strings, whatever that means for the given Python if isinstance(source, unicode): source = source.encode('utf-8', 'replace') elif not isinstance(source, basestring): source = source.decode('utf-8', 'replace') # Convert MS-DOS or Mac line endings to the one true way, and # prefix the source with a linefeed to simplify the tokenization. source = '\n' + source.replace('\r\n', '\n').replace('\r', '\n') line = 0 for tok in splitter(source): if tok.startswith('\n'): line += 1 col = len(tok) else: yield RsonToken(tok, line, col) col += len(tok) def multiline(lineinfo, dedent): ''' Returns one string for each line, properly dedented. ''' linenum = lineinfo[0].line for tok in lineinfo: while linenum < tok.line: yield '' linenum += 1 yield (tok.col - dedent) * ' ' + tok.rstrip() linenum += 1 def getfreeformat(toklist, firsttok, firstcol): ''' Returns a free-formatted string. ''' curline = firsttok.line firstpart = firsttok[1:].strip() # Get past = sign lineinfo = [] while toklist and toklist[-1].col > firstcol: tok = toklist.pop() if tok.line == curline: lineinfo[-1] += tok else: lineinfo.append(tok) curline = tok.line if lineinfo: dedent = min(tok.col for tok in lineinfo) if firstpart: lineinfo.insert(0, RsonToken(firstpart, firsttok.line, dedent)) firstpart = '\n'.join(multiline(lineinfo, dedent)) return RsonToken(firstpart, firsttok.line, firsttok.col) def loads(source): ''' load a string into an rsonlite datastructure. If the source is not a string instance, then loads will attempt to convert it into a string instance, by encoding to UTF-8 on Python 2, or decoding from UTF-8 on Python 3. ''' toklist = list(gettoks(source)) toklist.reverse() result = [None] stack = [] curcol = -1 curlist = result while toklist: tok = toklist.pop() if tok.startswith('#'): continue col = tok.col if col > curcol: stack.append((curcol, curlist)) oldlist = curlist curcol, curlist = col, [] oldlist[-1] = oldlist[-1], curlist while col < curcol: curcol, curlist = stack.pop() if col != curcol: err = IndentationError('unindent does not match any outer indentation level') err.filename = '' err.lineno = tok.line raise err if tok.startswith('='): curlist.append(getfreeformat(toklist, tok, col)) else: curlist.append(RsonToken(tok.rstrip(), tok.line, tok.col)) if toklist and toklist[-1].line == tok.line: tok = toklist.pop() if tok.startswith('='): curlist[-1] = curlist[-1], [getfreeformat(toklist, tok, col)] else: assert tok.startswith('#') # else problem in regex... result, = result return [] if result is None else result[1] def dumps(data, indent=' ', initial_indent=''): ''' Dump a string loaded with loads back out. ''' def getstring(data, indent2): if '\n' in data: data = ('\n'+indent2).join([''] + data.split('\n')) return data def recurse(data, indent2): assert isinstance(data, list), repr(data) for data in data: if isinstance(data, tuple): key, value = data if len(value) == 1 and isinstance(value[0], basestring): append('%s%s = %s' % (indent2, key, getstring(value[0], indent2+indent))) else: append('%s%s' % (indent2, key)) recurse(value, indent2 + indent) else: assert isinstance(data, basestring) if '\n' in data or '=' in data or '#' in data: append(indent2 + '=') append(getstring(data, indent2 + ' ')) else: append('%s%s' % (indent2, data)) result = [] append = result.append recurse(data, initial_indent) append('') return '\n'.join(result) def pretty(data, indent=' '): ''' Pretty-print a string loaded by loads into something that makes it easy to see the actual structure of the data. The return value of this should be parseable by eval() ''' def recurse(data, indent2): assert isinstance(data, list) for data in data: assert isinstance(data, (tuple, basestring)) if isinstance(data, tuple) and ( len(data[1]) != 1 or not isinstance(data[1][0], basestring)): append('%s(%s, [' % (indent2, repr(data[0]))) recurse(data[1], indent2 + indent) append('%s])' % (indent2)) else: append('%s%s,' % (indent2, repr(data))) result = [] append = result.append append('[') recurse(data, indent) append(']') append('') return '\n'.join(result) ########################################################################## # These higher-level functions might suffice for simple data, and also # provide a template for designing similar functions. def stringparse(s, special=dict(true=True, false=False, null=None)): ''' This gives an example of handling the JSON special identifiers true, false and null, and also of handling simple arrays. ''' if s in special: return special[s] if s.startswith('[') and s.endswith(']'): t = s[1:-1] for ch in '"\'[]{}\n': if ch in t: return s return [x.strip() for x in t.split(',')] return s def simpleparse(source, stringparse=stringparse, stddict=stddict): ''' Return the simplest structure that uses dicts instead of tuples, and doesn't lose any source information. Use ordered dicts if they are available. ''' def recurse(mylist): if len(mylist) == 1 and isinstance(mylist[0], basestring): return stringparse(mylist[0]) keys = [x[0] for x in mylist if isinstance(x, tuple)] if not keys: return mylist # simple list if len(set(keys)) == len(mylist): return stddict((x, recurse(y)) for (x, y) in mylist) # Complicated. Make a list that might have multiple dicts result = [] curdict = None for item in mylist: if not isinstance(item, tuple): result.append(stringparse(item)) curdict = None continue key, value = item if curdict is None or key in curdict: curdict = stddict() result.append(curdict) curdict[key] = recurse(value) return result return recurse(source if isinstance(source, list) else loads(source))