# @Brief:         Convert xml -> delimited ( an xml flattener )
# @Author:        Erik Schweller ( othererik .at. g mail .dot. com )
# @Copyright:     Copyright 2010
#  A two pass system for converting any generic xml file
#  to a delimited file ( flattens an xml file ).
#  Converts both elements and element-attribute
#  pairs to columns.
#  For attributes on each tag, a "1" is placed in column if the attribute
#  is present for the current row, otherwise the cell is left blank.

# TODO: choose best line ending based on data
# TODO: choose best delimiter based on data

# python
import getopt
import os
import sys

# lxml
from lxml import etree

# module metadata
__version__ = '1.0'
__author__ = 'Erik Schweller'
__date__ = '2010/01/24'

# GLOBAL DEFAULT SETTINGS
# main() reads these as the starting values for the command line options
# line ending style name; XMLToDelimited accepts 'windows', 'unix' or 'mac'
LINE_ENDING_STYLE = 'unix'
# character written after every cell of the output file
DELIMITER = '\t'
# when True a header line with the column names is written first
WRITE_HEADER = True
# if the following is None and there is no commandline override,
# the most repeated tag will be used for row breaks
ROW_BREAK_TAG = None

class XMLToDelimited( object ):
    '''Two pass converter that flattens an xml file into a delimited file.

    Pass one ( _find_columns ) walks the xml file and collects every
    element tag and every tag/attribute/value combination as a potential
    column, counting how often each one occurs.  Pass two
    ( _dump_to_file ) walks the file again and writes one output row each
    time an element whose tag equals the row break tag is yielded.
    Element columns hold the element text; attribute columns hold "1"
    when that attribute/value pair was seen for the row, else stay blank.

    Instances are callable: converter( xml_file, output_file ) is the
    same as converter.convert_to_delimited( xml_file, output_file ).
    '''

    def __init__( self, row_break_tag = None, line_ending_style = 'unix',
        delimiter = '\t', write_header = True, long_attr_name = False ):
        '''Store the conversion settings.

        row_break_tag     - tag that ends an output row; when None the most
                            frequent column is chosen for each conversion
        line_ending_style - 'windows', 'unix' or 'mac'
        delimiter         - string written after every cell
        write_header      - when True a header line of column names is written
        long_attr_name    - when True attribute columns are named in an
                            xml-like form, else as tag_attribute_value

        Raises AssertionError for an unknown line_ending_style.
        '''
        ending_style = { 'windows': '\r\n', 'unix': '\n', 'mac': '\r' }

        assert line_ending_style in ending_style, \
            'line_ending_style must be one of %s' % ( str( ending_style.keys() ) )

        self._line_ending = ending_style[ line_ending_style ]
        self._delimiter = delimiter
        self._write_header = write_header
        self._row_break_tag = row_break_tag

        # attributes can be written to header in two ways
        self._use_long_attr_name = long_attr_name

        # defined at call to convert_to_delimited
        self._out_file = None
        self._in_file = None

        # warning message -> True, reported and cleared by each conversion
        self._warnings = dict()

    def __call__( self, xml_file, output_file = 'output.txt', user_columns = None ):
        '''Callable version - directs to convert_to_delimited'''
        return self.convert_to_delimited( xml_file, output_file, user_columns )

    # driver
    def convert_to_delimited( self, xml_file, output_file = 'output.txt', user_columns = None ):
        '''Convert xml_file to output_file.

        xml_file     - path of an existing xml file
        output_file  - path of the file to ( over )write
        user_columns - optional collection of lower case column names;
                       when given, only those columns are written

        Returns True on success.  Returns False ( after printing the
        reason ) when the output file cannot be opened, no columns are
        found, or writing the header or the data fails.
        Raises AssertionError when xml_file does not exist.  Errors raised
        while scanning for columns are not caught here.
        '''
        assert os.path.exists( xml_file ), 'Error, %s does not exist' % ( xml_file )
        try:
            out = open( output_file, 'wb' )
        except Exception as err:
            print( 'Failed to open %s' % ( output_file ) )
            print( err )
            return False

        # remember the configured tag: an automatically chosen tag only
        # applies to this file and must not leak into the next conversion
        configured_break_tag = self._row_break_tag
        self._in_file = xml_file # string
        self._out_file = out #  stream
        try:
            columns = self._find_columns( user_columns )
            if len( columns ) == 0:
                print( 'No columns found in %s' % ( xml_file, ) )
                return False

            if self._row_break_tag is None:
                # ascending by count, so the last entry has the largest count
                by_count = sorted( columns, key = columns.get )
                self._row_break_tag = by_count[ -1 ]
                print( 'Automatic row break tag set to < %s >' % ( self._row_break_tag, ) )

            if self._write_header:
                try:
                    self._write_file_header( columns )
                except Exception as err:
                    print( 'Failed to write header to %s' % ( output_file ) )
                    print( err )
                    return False
            try:
                self._dump_to_file( columns )
            except Exception as err:
                print( 'Failed when processing data from %s'  % ( xml_file ) )
                print( err )
                return False

            for message in self._warnings:
                if self._warnings[ message ]:
                    print( 'WARNING: %s' % ( message, ) )
        finally:
            # clear per-conversion state and always release the output file
            self._warnings = dict()
            self._row_break_tag = configured_break_tag
            self._out_file = None
            out.close()
        return True

    # support functions - naming and text clean up
    def _attr_column_name( self, tag, name, value ):
        '''Return the column name used for attribute name=value on tag'''
        if self._use_long_attr_name:
            return '<' + tag + ' ' + name + '=' + "'" + value + "'>"
        return tag + '_' + name + '_' + value

    def _normalize_text( self, text ):
        '''Return element text with newline characters removed.

        An element without text ( text is None ) yields an empty string.
        A warning is recorded when a newline had to be removed.
        '''
        if text is None:
            return ''
        # for simplicity, '\n' is removed if discovered
        if '\n' in text:
            text = text.replace( '\n', '' )
            self._warnings[ 'New line character "\\n" Removed From Data' ] = True
        return text

    def _prepare_cell( self, text, quote ):
        '''Return text made safe to write as one cell.

        Text containing the delimiter is wrapped in the quote character,
        and any occurrence of the line ending is removed.  A warning is
        recorded for either change.
        '''
        # preserves white space if delimiter == whitespace
        if self._delimiter in text:
            text = quote + text + quote
            self._warnings[ 'Current Delimiter Removed From Data' ] = True
        if self._line_ending in text:
            text = text.replace( self._line_ending, '' )
            self._warnings[ 'Current Line Ending Removed From Data' ] = True
        return text

    # support functions - file out
    def _write_file_header( self, columns ):
        '''Writes header ( sorted column names ) to the output file'''
        for name in sorted( columns ):
            self._out_file.write( name )
            self._out_file.write( self._delimiter )
        self._out_file.write( self._line_ending )

    def _write_row( self, row, columns ):
        '''Writes one row at a time to the output file.

        Cells follow the sorted column order used by the header; a column
        that has no value in row is left blank.
        '''
        for name in sorted( columns ):
            if name in row:
                self._out_file.write( row[ name ] )
            self._out_file.write( self._delimiter )
        self._out_file.write( self._line_ending )

    def _dump_to_file( self, columns ):
        '''Fill output file with data (  text and attribute values  )'''
        row = dict()
        # quote character for cells holding the delimiter; must differ from it
        if self._delimiter == '"':
            quote = "'"
        else:
            quote = '"'
        for item in self._walk_file( self._in_file ):
            tag = item.tag
            attr = item.attrib
            text = self._normalize_text( item.text )
            if tag in columns:
                row[ tag ] = self._prepare_cell( text, quote )

            # every attribute/value pair present on the element marks its column
            for a in attr:
                row[ self._attr_column_name( tag, a, attr[ a ] ) ] = '1'

            # write the row once the row break tag is reached
            if self._row_break_tag == tag and len( row ) > 0:
                self._write_row( row, columns )
                row = dict()

    # xml parsing
    def _walk_file( self, file ):
        '''Memory friendly walk though an xml file, yields each element'''
        # throwing away the event
        for event, item in etree.iterparse( file ):
            # do some cleanup to save memory: drop preceding siblings
            # that have children from the tree
            if len( item ) > 0:
                previous_item = item.getprevious()
                while previous_item is not None and len( previous_item ) > 0:
                    p = previous_item.getparent()
                    if len( p ) > 0: p.remove( previous_item )
                    previous_item = item.getprevious()
            yield item

    def _find_columns( self, user_columns = None ):
        '''Scan xml for all elements and attributes.

        Returns a dict mapping each potential column name to the number
        of times it occurs.  When user_columns is not None, only columns
        whose lower cased name is in user_columns are kept.
        '''
        columns = dict()
        for item in self._walk_file( self._in_file ):
            tag = item.tag
            attr = item.attrib

            # attributes are placed in columns as tag_attr_value
            for a in attr:
                atag = self._attr_column_name( tag, a, attr[ a ] )
                columns[ atag ] = columns.get( atag, 0 ) + 1
            columns[ tag ] = columns.get( tag, 0 ) + 1

        # TODO: filter on base tag, not attribute loaded
        # apply user filter ( iterate over a copy, entries are deleted )
        if user_columns is not None:
            for c in list( columns.keys() ):
                if c.lower() not in user_columns:
                    del columns[ c ]
        return columns

def usage():
    '''Describes input arguments.

    Prints the command line help text to stdout.  The program name shown
    is taken from sys.argv[ 0 ] and prefixed with "python " when it ends
    with ".py".
    '''
    ex = os.path.basename( sys.argv[ 0 ] )
    if ex.endswith( '.py' ):
        ex = 'python ' + ex
    # the delimiter keywords listed here must match the ones main() accepts
    print( '''
---XML Flattener---
usage:\t''' + ex + ''' -i in_file -o out_file [ options ]

Required arguments:
-i --in_file           : Path to input file
-o --out_file          : Path to output file

Optional arguments:
-h --help              : Display this message

-c --columns           : List of tags to include in the
                          output seperated by commas with
                          no spaces (  e.g., "item,price,qty"  )

-d --delimiter         : Delimeter character surrounded by "
                          and escaped (  e.g., "\\t", or "," ).
                          "TAB", "SPACE", and "COMMA" keywords
                          may be used. To use "\\" as delimiter, enter "\\\\".

-l --line_ending_style : Line ending style ( e.g.,  "windows", "unix", "mac"  )

-r --row_break_tag     : Tag on which to create a new row in output file
                          If not selected, a new row will be made on most
                          frequent tag.

-n --long_attr_names   : If set, attribures shown in xml-like form in column
                          header, else form is "tag_attribute_value"
''' )

def main( argv ):
    '''Entry point for script based usage
       requires -i and -o arguments

    argv - list of command line arguments without the program name.

    Parses the options, builds an XMLToDelimited converter from the module
    level defaults plus any overrides, runs the conversion and prints
    whether it succeeded.  Exits with status 2 on an option parsing error
    or when fewer than two options are given without a help request, and
    exits after printing the usage text when help is requested.
    Raises AssertionError when the delimiter is not a single character or
    when the input or output file name is missing.
    '''
    try:
        # not using args
        opts, args = getopt.getopt( argv, 'i:o:hc:d:l:r:n', [ 'in_file=', 'out_file=',
         'help', 'columns=', 'delimiter=', 'line_ending_style=', 'row_break_tag=',
         'long_attr_names' ] )
    except getopt.GetoptError as err:
        print( str( err ) )
        usage()
        sys.exit( 2 )

    # opts holds ( option, value ) pairs, so test the option names themselves
    given = [ opt for opt, value in opts ]
    help_requested = '-h' in given or '--help' in given
    if len( opts ) < 2 and not help_requested:
        usage()
        sys.exit( 2 )

    # keyword / escaped spellings accepted for the delimiter
    delimit_map = { 'SPACE': ' ', 'COMMA': ',', 'TAB': '\t',
                    r'\t': '\t', r'\n': '\n', r'\c': r'\c', r'\r': '\r' }

    # set defaults ( module level settings are only read here )
    wh = WRITE_HEADER
    les = LINE_ENDING_STYLE
    d = DELIMITER
    rbt = ROW_BREAK_TAG
    lan = False
    in_file = None
    out_file = None
    columns = None

    # process commandline inputs, step on options where set
    for opt, arg in opts:
        if opt in ( '-h', '--help' ):
            usage()
            sys.exit()
        elif opt in ( '-c', '--columns' ):
            # column filter is matched against lower cased column names
            columns = [ c.lower() for c in arg.split( ',' ) if len( c ) > 0 ]
        elif opt in ( '-r', '--row_break_tag' ):
            rbt = arg
        elif opt in ( '-d', '--delimiter' ):
            d = arg
            # strip surrounding quotes only; a lone '"' is itself a valid
            # delimiter and must not be reduced to an empty string
            if len( d ) >= 2 and d.startswith( '"' ) and d.endswith( '"' ):
                d = d[ 1:-1 ]
            if d in delimit_map:
                d = delimit_map[ d ]
        elif opt in ( '-l', '--line_ending_style' ):
            les = arg
        elif opt in ( '-n', '--long_attr_names' ):
            lan = True
        elif opt in ( '-i', '--in_file' ):
            in_file = arg
        elif opt in ( '-o', '--out_file' ):
            out_file = arg
        else:
            assert False, "Error: unhandled option"

    assert len( d ) == 1, 'Delimeter must be a single character'
    assert in_file is not None, 'Must enter an input file name'
    assert out_file is not None, 'must enter an output file name'
    # create instance of converter with default settings
    converter = XMLToDelimited( row_break_tag = rbt, line_ending_style = les,
        delimiter = d, write_header = wh , long_attr_name = lan)

    # use converter for files
    # default settings may be changed by creating a new converter instance
    res = converter( in_file, out_file, user_columns = columns )
    if res:
        print( 'Successful conversion of %s to %s' % ( in_file, out_file ) )
    else:
        print( 'Conversion of %s failed' % ( in_file ) )

if __name__ == '__main__':
    # command line run
    if len( sys.argv ) != 1:
        # arguments were supplied: hand everything after the program name to main
        main( sys.argv[ 1: ] )
    else:
        # started without arguments: show the help text and wait for
        # RETURN before exiting
        usage()
        print( 'This is a commandline program only.\nPress RETURN to exit.' )
        raw_input()

