*************************************************************************
* Copyright (c) 2007 by The Regents of the University of Michigan
*
* makeddl.sps
* $Id: makeddl.sps,v 1.8 2009/08/13 19:22:36 overcash Exp $
*
* Generates SDA DDL file and ASCII data directly from SPSS system
* or portable file.
*
* Developed by:
*   Computing & Network Services
*   Inter-university Consortium for Political and Social Research (ICPSR)
*   Institute for Social Research (ISR)
*   University of Michigan
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND ICPSR, ISR, AND THE UNIVERSITY OF
* MICHIGAN DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING
* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
* ICPSR, ISR, OR THE UNIVERSITY OF MICHIGAN BE LIABLE FOR ANY SPECIAL,
* DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
* RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
* CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
*************************************************************************.

*----------------------------------------------------------------
* Suppress printback of submitted commands (listing() in the Python block re-enables it)
*----------------------------------------------------------------.
set printback = none.

*************************************************************************
* SYSTEM REQUIREMENTS:
*
*    SPSS 15 or higher (Linux or Windows)
*    Python 2.4 or higher (with standard os, sys, string, re, and shutil
*                library modules)
*    SPSS-Python programmability plug-in (select download package
*                carefully -- specific to operating system and
*                version of SPSS)
*    SPSS-Python modules:  spssaux, spssdata, extendedTransforms,
*                namedtuple, and trans
*
* SPSS-Python products and installation guide are freely downloadable
* from SPSS Developer Central:
*    http://www.spss.com/devcentral
*
* Python is freely downloadable from:
*    http://www.python.org/download
*
*************************************************************************
* INSTALLATION OF PYTHON AND SPSS PRODUCTS:
*
* For detailed instructions, please refer to the SPSS programmability
* installation guide specific to your operating system and version of SPSS.
*
* Python and the SPSS programmability plug-in must be installed first.
* After that, the SPSS auxiliary modules (such as spssaux) should be
* installed in the Python installation's "Lib\site-packages" directory,
* e.g.,
*
*    C:\Python24\Lib\site-packages
*
*************************************************************************
* This syntax has been tested with the following versions of SPSS and
* Python:
*
*    Linux:    SPSS Server 15.0 and Python 2.4.4
*    Windows:  SPSS 15.0.1 and Python 2.4.3
*
* To use:
*
*    (1) Edit the USER-DEFINED SPECIFICATIONS (lines 108-111 of this file)
*        to specify title and filenames.  This may be done in any text
*        editor, including the SPSS Syntax Editor in Windows.
*
*    (2) Run in SPSS as a normal syntax file.
*
*           Linux command line:
*               spssb -i -f makeddl.sps -type text -hide PTW
*
*           Windows:
*               Run edited file from SPSS Syntax Editor
*
*************************************************************************.

*----------------------------------------------------------------
* Python program block starts here.
*----------------------------------------------------------------.
BEGIN PROGRAM.

#================================================================
# EDIT HERE:  Specify dataset title, input and output filenames
#
# IMPORTANT:  Be sure to use forward slashes ( / ), NOT
# backslashes ( \ ), between directories in path specifications, e.g.,
#
#    asciidata_out = 'h:/mydir/sda/mydata.txt'
#
#================================================================
# USER-DEFINED SPECIFICATIONS:
#===============================

title = 'SDA Dataset Title'         # Written to the DDL header as "title = ..."
spssdata_in = 'spssdata'            # Input SPSS file; must end in .sav or .por
asciidata_out = 'asciidata.txt'     # Target of the SPSS WRITE command (ASCII data)
ddlfile_out = 'myddl.txt'           # Final DDL file (header + CASEID + variables)

#===============================
# DO NOT EDIT BELOW THIS LINE!
#================================================================
# FUNCTIONS
#================================================================
# Re-enable verbose output listing after running
#----------------------------------------------------------------
def listing():
        """Switch SPSS command printback back to 'listing'.

        Counterpart of the 'set printback = none.' command issued at the
        top of this syntax file.
        """
        restore_cmd = "set printback = listing."
        spss.Submit(restore_cmd)

#----------------------------------------------------------------
# Define cleanup function
#----------------------------------------------------------------
def cleanup():
        """Delete the temporary working directory and restore printback.

        Removes the directory tree named by the module-level `tmpout` and
        then calls listing(), whether or not the removal succeeded.

        Raises:
            Exception -- if the directory cannot be removed; the message
                         names the directory and the underlying error.
        """
        # try/except is nested inside try/finally (rather than combined)
        # because Python 2.4 does not accept the combined form.
        try:
                try:
                        shutil.rmtree(tmpout)
                except (OSError, SystemError):
                        # shutil.rmtree reports failures as OSError.  The
                        # active exception is fetched via sys.exc_info() so
                        # the same code is valid on old and new Pythons.
                        detail = sys.exc_info()[1]
                        raise Exception('Cannot delete ' + tmpout + ': ' + str(detail))
        finally:
                listing()

#----------------------------------------------------------------
# Reformatting function
#----------------------------------------------------------------
def reformat(name, n, oldtype, width, decimals, spssdata_in):
        """Convert variable `name` in the active dataset to an SDA-friendly format.

        Arguments:
            name        -- variable name as used in SPSS syntax
            n           -- number used to build the temporary name 'ddd<n>'
            oldtype     -- 'date' or 'numeric'; any other value does nothing
            width       -- field width for the numeric F format
            decimals    -- decimal places for the numeric F format; None or
                           '' is treated as 0
            spssdata_in -- path of the original SPSS file, used as the source
                           of APPLY DICTIONARY for date variables

        For 'date', the original variable is renamed to 'ddd<n>' and a new
        A11 string variable called `name` is created from it.  This function
        does not delete the renamed 'ddd<n>' variable.
        """
        dropname = 'ddd' + str(n)

        if oldtype == 'date':
                # Free up the original name for the new string variable.
                spss.Submit(r"""
                rename vars (%(name)s = %(dropname)s).
                """ % locals())

                # Build `name` as an A11 string from the renamed date variable.
                # Presumably datetimetostr renders the date with the given
                # strftime-style pattern (dd-Mon-yyyy, 11 characters) --
                # confirm against the extendedTransforms module.
                transdate = trans.Tfunction()
                transdate.append(extendedTransforms.datetimetostr,
                             name,
                             'A11',
                             [dropname, const("%d-%b-%Y")])
                transdate.execute()

                # Copy the dictionary information of the original variable
                # from the input file onto the new string variable.
                spss.Submit(r"""
                apply dictionary from = '%(spssdata_in)s'
                        /source variable = %(name)s
                        /target variable = %(name)s
                        /varinfo all.
                """ % locals())

        elif oldtype == 'numeric':
                # A format such as COMMA8 has no decimals part; without this
                # guard the command would read "F8.None".
                if decimals is None or decimals == '':
                        decimals = 0

                spss.Submit(r"""
                format %(name)s (F%(width)s.%(decimals)s).
                """ % locals())

#================================================================
# MAIN SCRIPT
#================================================================
# Import Python modules; string, re, os, sys, and shutil are
# included in the standard Python download.
#----------------------------------------------------------------
import spss, spssaux, extendedTransforms, trans, string, re, os, sys, shutil
from trans import const

#----------------------------------------------------------------
# Initialize variables
#----------------------------------------------------------------
import tempfile

sig_digits = 18              # SDA limit is 18 significant digits
name_chars = 16              # SDA variable name limit is 16 characters
start = 1                    # DDL column location
errors = 0                   # Number of variables rejected so far
pid = os.getpid()            # Process ID; makes the temp directory name unique

#----------------------------------------------------------------
# Temporary output directory and filenames.  Uses the SPSSTMPDIR
# environment variable; if it is unset or empty, falls back to the
# system temporary directory instead of failing with a KeyError.
#----------------------------------------------------------------
spsstmp = os.environ.get('SPSSTMPDIR') or tempfile.gettempdir()
tmpout = spsstmp + '/tmpout_' + str(pid)

hdr_ddl = tmpout + '/hdr.ddl'          # DDL header (path, title, reclen)
tmp_ddl = tmpout + '/tmp.ddl'          # DDL entries for all non-CASEID variables
caseid_ddl = tmpout + '/caseid.ddl'    # DDL entry for CASEID
allerrs = '\n'                         # Accumulated error messages

#----------------------------------------------------------------
# If specified SPSS file doesn't exist, exit
#----------------------------------------------------------------
if not os.path.exists(spssdata_in):
        nofile = "\n*** ERROR:  Specified file (" + spssdata_in + ") doesn't exist. Exiting...\n"
        listing()
        raise Exception(nofile)

#----------------------------------------------------------------
# Generate SPSS file command based on filetype:
#    get file (.sav)
#    import file (.por)
# The extension is compared case-insensitively, so names such as
# MYDATA.SAV are accepted as well.
#----------------------------------------------------------------
basefn, ext = os.path.splitext(spssdata_in)

actions = {'.sav': 'get', '.por': 'import'}
action = actions.get(ext.lower())

if action is None:
        listing()
        raise Exception('SPSS data file (' + spssdata_in + ') must have a .sav or .por extension\n')

command = action + ' file="' + spssdata_in + '".'

#----------------------------------------------------------------
# Specified SPSS file exists and is properly named; proceed.
# Make temp directory called tmpout unless it already exists.
#----------------------------------------------------------------
if not os.path.exists(tmpout):
        try:
                os.mkdir(tmpout)
        except (OSError, SystemError):
                # os.mkdir reports failures as OSError.  The active exception
                # is fetched via sys.exc_info() so the same code is valid on
                # old and new Pythons.
                detail = sys.exc_info()[1]
                listing()
                raise Exception('Cannot mkdir ' + tmpout + ': ' + str(detail))

#----------------------------------------------------------------
# Open input data file using get/import file command
#----------------------------------------------------------------
try:
        spss.Submit(command)
except Exception:
        # Keep the underlying error text so the user can see why SPSS
        # rejected the command, then remove the temp directory.
        detail = sys.exc_info()[1]
        cleanup()
        raise Exception("Couldn't " + command + " (" + str(detail) + ")")

#----------------------------------------------------------------
# Begin header info
#----------------------------------------------------------------
hdrout = open(hdr_ddl, "w")
header_lines = [
        'path         = .\n',
        'title        = ' + title + '\n',
        'records/case = 1\n',
]
hdrout.writelines(header_lines)
# hdrout stays open: the record length line is added once it is known.

#----------------------------------------------------------------
# Temporary DDL file that collects the per-variable definitions
#----------------------------------------------------------------
ddlout = open(tmp_ddl, "w")

#----------------------------------------------------------------
# Pattern that breaks an SPSS format such as F8.2 into
# group 1 = type, group 2 = width, group 3 = decimals
#----------------------------------------------------------------
splitter = re.compile(r'^([A-Z]+)([0-9]+)\.*([0-9]+)*$')

#----------------------------------------------------------------
# Loop through variables
#----------------------------------------------------------------
#----------------------------------------------------------------
# SPSS format families recognised by the loop below.  Built once,
# before the loop, because they are the same for every variable.
#    numericfmts: written to the DDL as type 'numeric'
#    charfmts:    written as type 'character' without conversion
#    datefmts:    converted to an 11-character string by reformat()
#                 and then written as type 'character'
# NOTE(review): SPSS custom currency formats are usually named
# CCA..CCE, which the bare 'CC' entry would not match -- confirm.
#----------------------------------------------------------------
numericfmts = [ 'F', 'CC', 'COMMA', 'DOLLAR', 'E', 'N', 'P', 'PCT' ]
charfmts = [ 'A', 'MONTH', 'WKDAY', 'QYR', 'WKYR', 'DATETIME' ]
datefmts = ['ADATE', 'DATE', 'DTIME', 'EDATE', 'JDATE', 'SDATE', 'TIME' ]

# One indented entry per variable, in file order; joined into `varlist`
# (the variable list of the final WRITE command) after the loop.
varnames = []

for i in range(spss.GetVariableCount()):
        n = i + 1
        myddl = ddlout

        #----------------------------------------------------------------
        # Get field information for DDL file
        #----------------------------------------------------------------
        name = spss.GetVariableName(i).upper()
        label = spss.GetVariableLabel(i)

        varnames.append('   ' + name)

        #----------------------------------------------------------------
        # SDA requires all variables to be formatted as numeric or
        # character only.  Split the print format into type, width and
        # decimals; a format the pattern cannot parse is reported like
        # any other unsupported format instead of crashing the run.
        #----------------------------------------------------------------
        printformat = spss.GetVariableFormat(i)
        rawformat = splitter.search(printformat)

        if rawformat is None:
                errmsg = '***ERROR: ' + name + ' has unrecognized format ' + printformat + '\n'
                errors = errors + 1
                allerrs = allerrs + errmsg
                continue

        formattype = rawformat.group(1)
        width = int(rawformat.group(2))
        # group(3) is None when the format has no decimals part (e.g. A8)
        decimals = int(rawformat.group(3) or 0)

        #----------------------------------------------------------------
        # Apply specified print format as write format to resolve
        # occasional print/write format discrepancies.  The print format
        # is the value that is displayed in the SPSS Windows Data Editor.
        #----------------------------------------------------------------
        spss.Submit(r"""
                write formats %(name)s (%(printformat)s).
                execute.
                """ % locals())

        #----------------------------------------------------------------
        # Automatically reformat non-alphanumeric variables if at all
        # possible.  vartype stays 0 when the format is in none of the
        # three families.
        #----------------------------------------------------------------
        vartype = 0

        if formattype in numericfmts:
                oldtype = 'numeric'
                vartype = 'numeric'
                mvaltag = 'md'
                # COMMA and DOLLAR are rewritten as plain F formats
                if formattype in ('COMMA', 'DOLLAR'):
                        reformat(name, n, oldtype, width, decimals, spssdata_in)

        elif formattype in charfmts:
                vartype = 'character'
                mvaltag = 'md_c'

        elif formattype in datefmts:
                oldtype = 'date'
                vartype = 'character'
                mvaltag = 'md_c'
                width = 11
                reformat(name, n, oldtype, width, '0', spssdata_in)

        #----------------------------------------------------------------
        # User will need to reformat all other variables in SPSS
        #----------------------------------------------------------------
        if vartype == 0:
                vartype = 'other'
                errmsg = '***ERROR: ' + name + ' is formatted as ' + formattype + '. SDA requires numeric or character formats.\n'
                errors = errors + 1
                allerrs = allerrs + errmsg
                continue

        #----------------------------------------------------------------
        # SDA limits variable names to 16 characters
        #----------------------------------------------------------------
        if len(name) > name_chars:
                errmsg = '***ERROR: Variable name ' + name + ' is longer than ' + str(name_chars) + ' characters\n'
                errors = errors + 1
                allerrs = allerrs + errmsg
                continue

        #----------------------------------------------------------------
        # SDA limits numeric variables to 18 significant digits
        #----------------------------------------------------------------
        if vartype == 'numeric' and width > sig_digits:
                errmsg = '***ERROR: ' + name + ' has more than ' + str(sig_digits) + ' significant digits\n'
                errors = errors + 1
                allerrs = allerrs + errmsg
                continue

        #----------------------------------------------------------------
        # Save CASEID to be displayed first in the final DDL
        #----------------------------------------------------------------
        if name == 'CASEID':
                caseidout = open(caseid_ddl, "w")
                myddl = caseidout

        #----------------------------------------------------------------
        # Write out rest of temporary DDL file
        #----------------------------------------------------------------
        myddl.write('*\nname = ' + name)
        myddl.write('\nlabel = ' + label)
        myddl.write('\ntype = ' + vartype)
        myddl.write('\ncolumn = ' + str(start))
        myddl.write('\nwidth = ' + str(width))

        if vartype == 'numeric' and decimals > 0:
                myddl.write('\ndecimals = ' + str(decimals))

        #----------------------------------------------------------------
        # Translate SPSS missing value range indicators to SDA DDL syntax
        #----------------------------------------------------------------
        missList = spssaux.GetMissingValues(i)

        if missList:
                if vartype == 'numeric':
                        missList = missList.replace("Lowest through ", "*-")
                        missList = missList.replace(" through Highest", "-*")
                        missList = missList.replace(" through ", "-")
                        missList = missList.replace(", and ", ", ")
                        myddl.write('\n' + mvaltag + ' = ' + missList)
                elif vartype == 'character':
                        # Double embedded single quotes, then turn the
                        # double-quoted SPSS list into a single-quoted one
                        missList = missList.replace("'", "''")
                        missList = re.sub('^"', "", missList)
                        missList = re.sub('"$', '', missList)
                        missList = missList.replace('", "', "', '")
                        myddl.write('\n' + mvaltag + ' = ' + "'" + missList + "'")

        #----------------------------------------------------------------
        # Value labels
        #----------------------------------------------------------------
        catlabels = spssaux.GetValueLabels(i)

        if catlabels:
                myddl.write('\ncatlabels = \n')

                for lbl in sorted(catlabels):
                        catlabel = catlabels[lbl]

                        if vartype == 'character':
                                # Character codes are single-quoted, with
                                # embedded single quotes doubled
                                lbl = "'" + lbl.replace("'", "''") + "'"

                        myddl.write("            " + lbl + " " + catlabel + "\n")
        else:
                myddl.write('\n')

        #----------------------------------------------------------------
        # Calculate next start column based on width of variable just
        # added
        #----------------------------------------------------------------
        start = start + int(width)

        if name == 'CASEID':
                caseidout.close()

#----------------------------------------------------------------
# Always close the temporary variable DDL itself.  (Closing the
# file used by the last iteration would leave it open, and possibly
# unflushed, whenever CASEID happens to be the last variable.)
#----------------------------------------------------------------
ddlout.close()
varlist = '\n'.join(varnames)

#----------------------------------------------------------------
# Clean up and exit here if errors
#----------------------------------------------------------------
if errors != 0:
        # At least one variable was rejected: release the header file,
        # remove the temp directory, then report every collected message.
        hdrout.close()
        cleanup()
        allerrs += '\nExiting ...\n'
        raise ValueError(allerrs)

#----------------------------------------------------------------
# If CASEID doesn't exist in data file, create one
#----------------------------------------------------------------
if not os.path.exists(caseid_ddl):
        # CASEID must be wide enough to hold the highest case number
        cwidth = len(str(spss.GetCaseCount()))

        #----------------------------------------------------------------
        # Add CASEID to the active dataset and save a copy of the data
        # as <input name>_cid.sav
        #----------------------------------------------------------------
        caseid_cmds = (
                "compute CASEID = $casenum.",
                "format CASEID (F" + str(cwidth) + ".0).",
                "variable label CASEID 'CASE IDENTIFICATION NUMBER'.",
                "save outfile = '" + basefn + "_cid.sav'.",
        )
        for cmd in caseid_cmds:
                spss.Submit(cmd)

        #----------------------------------------------------------------
        # Matching DDL entry; CASEID takes the columns after the last
        # variable written so far
        #----------------------------------------------------------------
        caseid_entry = (
                '*\nname = CASEID'
                + '\nlabel = CASE IDENTIFICATION NUMBER'
                + '\ntype = numeric'
                + '\ncolumn = ' + str(start)
                + '\nwidth = ' + str(cwidth)
                + '\n'
        )
        caseidout = open(caseid_ddl, "w")
        caseidout.write(caseid_entry)
        caseidout.close()

        start += cwidth
        varlist += '\n   CASEID'

#----------------------------------------------------------------
# Calculate LRECL and add to DDL header
#----------------------------------------------------------------
# `start` is the column after the last variable, so the record
# length is one less.
reclen = start - 1
reclen_line = 'reclen       = ' + str(reclen) + '\n'
hdrout.write(reclen_line)
hdrout.close()

#----------------------------------------------------------------
# Combine DDL pieces into one
#----------------------------------------------------------------
try:
        # Final DDL = header, then CASEID, then all other variables
        chunks = [hdr_ddl, caseid_ddl, tmp_ddl]

        ddlout = open(ddlfile_out, 'w')
        try:
                for chunk in chunks:
                        piece = open(chunk)
                        try:
                                ddlout.write(piece.read())
                        finally:
                                piece.close()
        finally:
                ddlout.close()

except (IOError, OSError, SystemError):
        # File errors are raised as IOError/OSError.  The active exception
        # is fetched via sys.exc_info() so the same code is valid on old
        # and new Pythons.
        detail = sys.exc_info()[1]
        cleanup()
        raise Exception('Cannot combine DDL files: ' + str(detail))

#----------------------------------------------------------------
# Write out ASCII data file that corresponds to the columns
# specified in the DDL.
#----------------------------------------------------------------
try:
        spss.Submit("write outfile='" + asciidata_out + "' table / \n" + varlist + ".")
        spss.Submit("execute.")

except Exception:
        # Keep the underlying error text so the user can see why the
        # write failed, then remove the temp directory.
        detail = sys.exc_info()[1]
        cleanup()
        raise Exception("Couldn't write out ASCII data (" + asciidata_out + "): " + str(detail))

#----------------------------------------------------------------
# Successful run; clean up and exit Python program block.
#----------------------------------------------------------------
cleanup()

# Each banner line carries its own "\n", so print leaves a blank
# line after it.
rule = "-------------------------------\n"
print(rule)
print("Done!  Conversion successful\n")
print(rule)

END PROGRAM.