************************************************************************* * Copyright (c) 2007 by The Regents of the University of Michigan * * makeddl.sps * $Id: makeddl.sps,v 1.8 2009/08/13 19:22:36 overcash Exp $ * * Generates SDA DDL file and ASCII data directly from SPSS system * or portable file. * * Developed by: * Computing & Network Services * Inter-university Consortium for Political and Social Research (ICPSR) * Institute for Social Research (ISR) * University of Michigan * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND ICPSR, ISR, AND THE UNIVERSITY OF * MICHIGAN DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING * ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL * ICPSR, ISR, OR THE UNIVERSITY OF MICHIGAN BE LIABLE FOR ANY SPECIAL, * DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER * RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF * CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. * *************************************************************************. *---------------------------------------------------------------- * Suppress printback of submitted commands in SPSS output *----------------------------------------------------------------. set printback = none. 
*************************************************************************
* SYSTEM REQUIREMENTS:
*
*   SPSS 15 or higher (Linux or Windows)
*   Python 2.4 or higher (with standard os, sys, string, re, and shutil
*        library modules)
*   SPSS-Python programmability plug-in (select download package
*        carefully -- specific to operating system and
*        version of SPSS)
*   SPSS-Python modules: spssaux, spssdata, extendedTransforms,
*        namedtuple, and trans
*
* SPSS-Python products and installation guide are freely downloadable
* from SPSS Developer Central:
*      http://www.spss.com/devcentral
*
* Python is freely downloadable from:
*      http://www.python.org/download
*
*************************************************************************
* INSTALLATION OF PYTHON AND SPSS PRODUCTS:
*
* For detailed instructions, please refer to the SPSS programmability
* installation guide specific to your operating system and version of SPSS.
*
* Python and the SPSS programmability plug-in must be installed first.
* After that, the SPSS auxiliary modules (such as spssaux) should be
* installed in the Python installation's "Lib\site-packages" directory,
* e.g.,
*
*      C:\Python24\Lib\site-packages
*
*************************************************************************
* This syntax has been tested with the following versions of SPSS and
* Python:
*
*   Linux:   SPSS Server 15.0 and Python 2.4.4
*   Windows: SPSS 15.0.1 and Python 2.4.3
*
* To use:
*
*   (1) Edit the USER-DEFINED SPECIFICATIONS section at the top of the
*       Python program block in this file (title, spssdata_in,
*       asciidata_out, and ddlfile_out) to specify the title and
*       filenames. This may be done in any text editor, including the
*       SPSS Syntax Editor in Windows.
*
*   (2) Run in SPSS as a normal syntax file.
*
*       Linux command line:
*           spssb -i -f makeddl.sps -type text -hide PTW
*
*       Windows:
*           Run edited file from SPSS Syntax Editor
*
*************************************************************************.
*----------------------------------------------------------------
* Python program block starts here.
*----------------------------------------------------------------. BEGIN PROGRAM. #================================================================ # EDIT HERE: Specify dataset title, input and output filenames # # IMPORTANT: Be sure to use forward slashes ( / ), NOT back # slashes ( \ ) between directories in path specifications, e.g., # # asciidata_out = 'h:/mydir/sda/mydata.txt' # #================================================================ # USER-DEFINED SPECIFICATIONS: #=============================== title = 'SDA Dataset Title' spssdata_in = 'spssdata' # Must be .sav or .por asciidata_out = 'asciidata.txt' ddlfile_out = 'myddl.txt' #=============================== # DO NOT EDIT BELOW THIS LINE! #================================================================ # FUNCTIONS #================================================================ # Re-enable verbose output listing after running #---------------------------------------------------------------- def listing(): spss.Submit("set printback = listing.") #---------------------------------------------------------------- # Define cleanup function #---------------------------------------------------------------- def cleanup(): try: shutil.rmtree(tmpout) listing() except SystemError, detail: listing() raise Exception, 'Cannot delete ' + tmpout, detail #---------------------------------------------------------------- # Reformatting function #---------------------------------------------------------------- def reformat(name, n, oldtype, width, decimals, spssdata_in): dropname = 'ddd' + str(n) if oldtype == 'date': spss.Submit(r""" rename vars (%(name)s = ddd%(n)s). """ %locals()) transdate = trans.Tfunction() transdate.append(extendedTransforms.datetimetostr, name, 'A11', [dropname,const("%d-%b-%Y")]) transdate.execute() spss.Submit(r""" apply dictionary from = '%(spssdata_in)s' /source variable = %(name)s /target variable = %(name)s /varinfo all. 
""" %locals()) elif oldtype == 'numeric': spss.Submit(r""" format %(name)s (F%(width)s.%(decimals)s). """ %locals()) #================================================================ # MAIN SCRIPT #================================================================ # Import Python modules; string, re, os, sys, and shutil are # included in the standard Python download. #---------------------------------------------------------------- import spss, spssaux, extendedTransforms, trans, string, re, os, sys, shutil from trans import const #---------------------------------------------------------------- # Initialize variables #---------------------------------------------------------------- sig_digits = 18 # SDA limit is 18 significant digits name_chars = 16 # SDA variable name limit is 16 characters start = 1 # DDL column location errors = 0 pid = os.getpid() # Process ID #---------------------------------------------------------------- # Temporary output directory and filenames. Uses SPSSTMPDIR # environment variable #---------------------------------------------------------------- spsstmp = os.environ['SPSSTMPDIR'] tmpout = spsstmp + '/tmpout_' + str(pid) hdr_ddl = tmpout + '/hdr.ddl' tmp_ddl = tmpout + '/tmp.ddl' caseid_ddl = tmpout + '/caseid.ddl' allerrs = '\n' #---------------------------------------------------------------- # If specified SPSS file doesn't exist, exit #---------------------------------------------------------------- if os.path.exists(spssdata_in) == 0: nofile = "\n*** ERROR: Specified file (" + spssdata_in + ") doesn't exist. 
Exiting...\n" listing() raise Exception, nofile #---------------------------------------------------------------- # Generate SPSS file command based on filetype: # get file (.sav) # import file (.por) #---------------------------------------------------------------- basefn, ext = os.path.splitext(spssdata_in) if ext == '.sav': action = 'get' elif ext == '.por': action = 'import' else: listing() raise Exception, 'SPSS data file (' + spssdata_in + ') must have a .sav or .por extension\n' command = action + ' file="' + spssdata_in + '".' #---------------------------------------------------------------- # Specified SPSS file exists and is properly named; proceed. # Make temp directory called tmpout unless it already exists. #---------------------------------------------------------------- if os.path.exists(tmpout) == 0: try: os.mkdir(tmpout) except SystemError, detail: listing() raise Exception, 'Cannot mkdir tmpout: ', detail #---------------------------------------------------------------- # Open input data file using get/import file command #---------------------------------------------------------------- try: spss.Submit(command) except: cleanup() raise Exception, "Couldn't " + command #---------------------------------------------------------------- # Begin header info #---------------------------------------------------------------- hdrout = open(hdr_ddl, "w") hdrout.write('path = .\n') hdrout.write('title = ' + title + '\n') hdrout.write('records/case = 1\n') #---------------------------------------------------------------- # Temporary DDL variable output file #---------------------------------------------------------------- ddlout = open(tmp_ddl, "w") #---------------------------------------------------------------- # Regular expression for splitting SPSS formats into # type, width, and decimals (e.g., F8.2) #---------------------------------------------------------------- splitter = re.compile(r'^([A-Z]+)([0-9]+)\.*([0-9]+)*$') 
#---------------------------------------------------------------- # Loop through variables #---------------------------------------------------------------- for i in range(spss.GetVariableCount()): n = i + 1 myddl = ddlout #---------------------------------------------------------------- # Get field information for DDL file #---------------------------------------------------------------- name = spss.GetVariableName(i).upper() label = spss.GetVariableLabel(i) if n == 1: varlist = ' ' + name else: varlist = varlist + '\n ' + name #---------------------------------------------------------------- # SDA requires all variables to be formatted as numeric or # character only. #---------------------------------------------------------------- printformat = spss.GetVariableFormat(i) rawformat = splitter.search(printformat) formattype = rawformat.group(1) width = int(rawformat.group(2)) decimals = rawformat.group(3) if decimals == '': decimals = 0 #---------------------------------------------------------------- # Apply specified print format as write format to resolve # occasional print/write format discrepancies. The print format # is the value that is displayed in the SPSS Windows Data Editor. #---------------------------------------------------------------- spss.Submit(r""" write formats %(name)s (%(printformat)s). execute. """ %locals()) #---------------------------------------------------------------- # Automatically reformat non-alphanumeric variables if at all # possible. Applicable format types are listed in 'numericfmts', # 'charfmts', and 'datefmts' arrays. 
#---------------------------------------------------------------- numericfmts = [ 'F', 'CC', 'COMMA', 'DOLLAR', 'E', 'N', 'P', 'PCT' ] charfmts = [ 'A', 'MONTH', 'WKDAY', 'QYR', 'WKYR', 'DATETIME' ] datefmts = ['ADATE', 'DATE', 'DTIME', 'EDATE', 'JDATE', 'SDATE', 'TIME' ] vartype = 0 for fmt in numericfmts: if formattype == fmt: oldtype = 'numeric' vartype = 'numeric' mvaltag = 'md' if fmt == 'COMMA' or fmt == 'DOLLAR': reformat(name, n, oldtype, width, decimals, spssdata_in) break if vartype == 0: for fmt in charfmts: if formattype == fmt: vartype = 'character' mvaltag = 'md_c' break if vartype == 0: for fmt in datefmts: if formattype == fmt: oldtype = 'date' vartype = 'character' mvaltag = 'md_c' width = 11 reformat(name, n, oldtype, width, '0', spssdata_in) break #---------------------------------------------------------------- # User will need to reformat all other variables in SPSS #---------------------------------------------------------------- if vartype == 0: vartype = 'other' errmsg = '***ERROR: ' + name + ' is formatted as ' + formattype + '. 
SDA requires numeric or character formats.\n' errors = errors + 1 allerrs = allerrs + errmsg continue #---------------------------------------------------------------- # SDA limits variable names to 16 characters #---------------------------------------------------------------- namewidth = len(name) if namewidth > name_chars: errmsg = '***ERROR: Variable name ' + name + ' is longer than ' + str(name_chars) + ' characters\n' errors = errors + 1 allerrs = allerrs + errmsg continue #---------------------------------------------------------------- # SDA limits numeric variables to 18 significant digits #---------------------------------------------------------------- if vartype == 'numeric' and width > sig_digits: errmsg = '***ERROR: ' + name + ' has more than ' + str(sig_digits) + ' significant digits\n' errors = errors + 1 allerrs = allerrs + errmsg continue #---------------------------------------------------------------- # Save CASEID to be displayed first in the final DDL #---------------------------------------------------------------- if name == 'CASEID': caseidout = open(caseid_ddl, "w") myddl = caseidout #---------------------------------------------------------------- # Write out rest of temporary DDL file #---------------------------------------------------------------- myddl.write('*\nname = ' + name) myddl.write('\nlabel = ' + label) myddl.write('\ntype = ' + vartype) myddl.write('\ncolumn = ' + str(start)) myddl.write('\nwidth = ' + str(width)) if decimals > 0 and vartype == 'numeric': myddl.write('\ndecimals = ' + decimals) #---------------------------------------------------------------- # Translate SPSS missing value range indicators to SDA DDL syntax #---------------------------------------------------------------- missList = spssaux.GetMissingValues(i) if missList: if vartype == 'numeric': missList = string.replace(missList, "Lowest through ", "*-") missList = string.replace(missList, " through Highest", "-*") missList = string.replace(missList, " 
through ", "-") missList = string.replace(missList, ", and ", ", ") myddl.write('\n' + mvaltag + ' = ' + missList ) elif vartype == 'character': missList = re.sub("'", "''", missList) missList = re.sub('^"', "", missList) missList = re.sub('"$', '', missList) missList = re.sub('", "', "', '", missList) myddl.write('\n' + mvaltag + ' = ' + "'" + missList + "'" ) #---------------------------------------------------------------- # Value labels #---------------------------------------------------------------- catlabels = spssaux.GetValueLabels(i) if catlabels: myddl.write('\ncatlabels = \n') for lbl in sorted(catlabels): catlabel = catlabels[lbl] if vartype == 'character': lbl = string.replace(lbl, "'", "''") lbl = re.sub('^', "'", lbl) lbl = re.sub('$', "'", lbl) myddl.write(" " + lbl + " " + catlabel + "\n") else: myddl.write('\n') #---------------------------------------------------------------- # Calculate next start column based on width of variable just # added #---------------------------------------------------------------- start = start + int(width) if name == 'CASEID': myddl.close() myddl.close() #---------------------------------------------------------------- # Clean up and exit here if errors #---------------------------------------------------------------- if errors > 0: hdrout.close() cleanup() allerrs = allerrs + '\nExiting ...\n' raise ValueError, allerrs #---------------------------------------------------------------- # If CASEID doesn't exist in data file, create one #---------------------------------------------------------------- if os.path.exists(caseid_ddl) == 0: ccount = spss.GetCaseCount() casecount = str(ccount) cwidth = len( casecount ) cidwidth = str(cwidth) #---------------------------------------------------------------- # Compute variable #---------------------------------------------------------------- spss.Submit("compute CASEID = $casenum.") spss.Submit("format CASEID (F" + cidwidth + ".0).") spss.Submit("variable label CASEID 'CASE 
IDENTIFICATION NUMBER'.") spss.Submit("save outfile = '" + basefn + "_cid.sav'.") #---------------------------------------------------------------- # DDL definition #---------------------------------------------------------------- caseidout = open(caseid_ddl, "w") caseidout.write('*\nname = CASEID') caseidout.write('\nlabel = CASE IDENTIFICATION NUMBER') caseidout.write('\ntype = numeric') caseidout.write('\ncolumn = ' + str(start)) caseidout.write('\nwidth = ' + str(cwidth)) caseidout.write('\n') caseidout.close() start = start + cwidth varlist = varlist + '\n CASEID' #---------------------------------------------------------------- # Calculate LRECL and add to DDL header #---------------------------------------------------------------- reclen = start - 1 hdrout.write('reclen = ' + str(reclen) + '\n') hdrout.close() #---------------------------------------------------------------- # Combine DDL pieces into one #---------------------------------------------------------------- try: ddlout = open(ddlfile_out,'w') chunks = [hdr_ddl, caseid_ddl, tmp_ddl] for i in range(len(chunks)): ddlout.write( open(chunks[i]).read() ) ddlout.close() except SystemError, detail: cleanup() raise Exception, 'Cannot combine DDL files: ', detail #---------------------------------------------------------------- # Write out ASCII data file that corresponds to the columns # specified in the DDL. #---------------------------------------------------------------- try: spss.Submit("write outfile='" + asciidata_out + "' table / \n" + varlist + ".") spss.Submit("execute.") except: cleanup() raise Exception, "Couldn't write out ASCII data (" + asciidata_out + ")" #---------------------------------------------------------------- # Successful run; clean up and exit Python program block. #---------------------------------------------------------------- cleanup() print "-------------------------------\n" print "Done! Conversion successful\n" print "-------------------------------\n" END PROGRAM.