AAII SIPRO - Perl script to generate CSV files

Below is a (old) perl script I wrote to extract and merge data from the various data tables in SIPRO into simpler to process CSV files:

#!/usr/bin/perl
use strict;
use warnings;
use FileHandle;
use XBase;
use Data::Dumper;

my $SEPARATOR = "\t";
my $DATAFILEPREFIX = "si";
my $DATAFILEEXT = "csv";

# First, figure out what dates this dataset covers. We use
# the three dates in setup.dbf for this (month, week, split):
my $table = new XBase 'name' => windows_name ('setup.dbf')
    or die "error opening setup.dbf\n";
my @dates = $table->get_record (0);

# Figure out database layout
my $dblayout;
my $execstr = "dbfdump --info " . windows_name ('setup.dbf');
my $resultstr = `$execstr`;
if (($resultstr =~ /1\.\tMONTHDATE/m) &&
    ($resultstr =~ /2\.\tWEEKDATE/m) &&
    ($resultstr =~ /3\.\tSPLITDATE/m)) {
    $dblayout = 201001;
} elsif (($resultstr =~ /4\.\tDATADATE/) &&
	 ($resultstr =~ /5\.\tPRICEDATE/) &&
	 ($resultstr =~ /6\.\tSPLITDATE/)) {
    my @newdates;
    # Set price date to datadate if price date is missing:
    $dates[5] = $dates[4]
	unless ($dates[5] && ($dates[5] ne ''));
    @newdates = (0, @dates[4..6]);
    @dates = @newdates;
    $dblayout = 200003;
} elsif (($resultstr =~ /4\.\tDATADATE/) &&
	 ($resultstr =~ /5\.\tPRICE_YEAR/)) {
    @dates = (0, $dates[4], $dates[4], $dates[4]);
    $dblayout = 199901;
} else {
    print "error - unknown setup.dbf format";
}

if (($dblayout != 201001) &&
    ($dblayout != 200003) &&
    ($dblayout != 199901)) {
    print "error - unknown dblayout";
    exit;
}

if (@ARGV && ($ARGV[0] eq '--info')) {
    print join ('-', @dates[1..$#dates]);
    exit;
}

# Generate output filenames based on the dates in setup.dbf:
my $timestamp = join ('-', @dates[1..$#dates]);
my $DATAFILENAME = "$DATAFILEPREFIX-$timestamp";
my $numdatfilename = "$DATAFILENAME-numdat.$DATAFILEEXT";
my $vardatfilename = "$DATAFILENAME-vardat.$DATAFILEEXT";
my $datadictfilename = "$DATAFILENAME-dictdt.$DATAFILEEXT";
my $drpsfilename = "$DATAFILENAME-drpsdt.$DATAFILEEXT";
my $sectorfilename = "$DATAFILENAME-secavg.$DATAFILEEXT";
my $industryfilename = "$DATAFILENAME-indavg.$DATAFILEEXT";

print STDERR "Checking: $numdatfilename ", `pwd`;
if (-s $numdatfilename) {
    print STDERR "sipro2csv skipped - files ($numdatfilename) exist\n";
    exit;
}

# These tables either do not use# the COMPANY_ID field. 
# Dump them "as is"/verbatim.

my %DUMPSIMPLETABLES = ($datadictfilename => 'datadict.dbf', 
			$drpsfilename => 'drps.dbf',
			$sectorfilename => 'si_mgavg.dbf',
			$industryfilename => 'si_mgav2.dbf');

# These tables contain the company specific data (they use COMPANY_ID
# to tie everything together). The connection between COMPANY_ID and
# TICKER is found in si_ci.dbf.
my @tablenames = qw (si_ci.dbf si_isq.dbf si_bsq.dbf si_isa.dbf
                     si_bsa.dbf si_psdc.dbf si_psdh.dbf si_psdl.dbf si_psdd.dbf
                     si_rat.dbf si_perc.dbf si_gr.dbf si_mlt.dbf si_cfq.dbf
                     si_psd.dbf si_cfa.dbf si_ee.dbf si_psda.dbf si_val.dbf
                     si_date.dbf);

if ($dblayout > 200003) {
    push @tablenames, 'si_psdv.dbf';
}

# Verify that all the files can be found, or exit.
files_exists (@tablenames)
    or die;
files_exists (values %DUMPSIMPLETABLES)
    or die;

# xbase data types are:
# C - character
# M - memo
# D - date YYYYMMDD
# N - number
# L - logical/boolean (Y,y,T,t = true, N,n,F,f = false)
# 0 - NULLFLAGS, just an extra XBase specific field which we currently do not use.

print STDERR "Will output to the following files:\n";
print STDERR "  $datadictfilename\n";
print STDERR "  $drpsfilename\n";
print STDERR "  $sectorfilename\n";
print STDERR "  $industryfilename\n";
print STDERR "  $vardatfilename\n";
print STDERR "  $numdatfilename\n";
print STDERR "  starting in 5 seconds ";

# Give the user some time to abort the process if something looks wrong:
for (1..5) {
    print STDERR ".";
    sleep (1);
}
print STDERR "\n";

# Record time usage
my $time_start = time;

# Start with dumping the tables that do not need to be joined immediately with
# other tables (these are the tables that will be output to separate files):
foreach my $file (keys %DUMPSIMPLETABLES) {
    if (-s $file) {
	print STDERR "Skipping: $file (file exists)\n";
    } else {
    print STDERR "Writing: $file\n";
    open (OUTPUT, ">$file");
    $table = new XBase 'name' => windows_name ($DUMPSIMPLETABLES{$file})
	or die "error opening $DUMPSIMPLETABLES{$file} ($!)";
    my $tablename = uc $DUMPSIMPLETABLES{$file};
    $tablename =~ s/\.DBF//;
    my @colnames = $table->field_names;
    my @coltypes = $table->field_types;
    my @collengths = $table->field_lengths;
    my @coldecimals = $table->field_decimals;
    # Get rid of last item, i.e. NULLFLAGS entry (XBase specific stuff)
    pop @colnames;
    pop @coltypes;
    pop @collengths;
    pop @coldecimals;
    print STDERR "  (headers: ", $#colnames+1, ")\n";
    my @output = ();
    for (my $i=0; $i prepare_select;
    while (my @data = $cursor->fetch) {
	pop @data;
	$numrows++;
	foreach my $data (@data) {
	    $data = ""
		unless defined ($data);
	    # Clean up data by removing leading blanks, zeros etc..
	    $data =~ s/^\s+//mg;
	    $data =~ s/\s+$//mg;
		# Make sure data does not contain the separator
		$data =~ s/$SEPARATOR+//mg;
	}
	if ($oneshot) {
	    print STDERR "  (data   : ", $#data+1, ")\n";
	    $oneshot = 0;
	}
	print OUTPUT join ($SEPARATOR, @data), "\n";
    }
    print STDERR "  (numrows: $numrows)\n";
    close (OUTPUT);
    }
}

=pod

Ok, trivial files have been extracted. Next we are going to merge related
items from different dbfs into one table. Simultaneously, we are going to
store all variable length data in a separate file since most of this data
is "informational only" and not used for actual analysis.

My algorithm for doing this is:

1. Open "si_ci.dbf"
2. For each record in si_ci.dbf:
  2.1 Write variable length fields to "vardat".
  2.2 Lookup all related fields in all dbfs using COMPANY_ID and write it
      to "numdat".

=cut


# Names of fields to be moved into "vardat", i.e. all variable length in si_ci.dbf:
my %vardatnames = (
		   COMPANY => 1,
		   TICKER => 1,
		   STREET => 1,
		   CITY => 1,
		   STATE => 1,
		   ZIP => 1,
#		   COUNTRY => 1,
		   PHONE => 1,
		   WEB_ADDR => 1,
		   BUSINESS => 1,
		   ANALYST_FN => 1,
		  );
# If field name in si_ci.dbf does _not_ match any of the names above, it will be
# assumed to be a fixed length datatype and written to "numdat".

# Hash to hold all the XBase objects instantiated:
my (%tables, %tablecache);

# Hash to hold the field definitions of vardat and numdat respectively:
my (@vardat, @numdat);

# Open up si_ci.dbf:
$tables{lc $tablenames[0]} = new XBase 'name' => windows_name ($tablenames[0])
    or die XBase->errstr;

# Get field definitions and set up proper data structures in vardat/numdat
# hashes:
my @colnames = $tables{lc $tablenames[0]}->field_names ();
my @coltypes = $tables{lc $tablenames[0]}->field_types ();
my @collengths = $tables{lc $tablenames[0]}->field_lengths ();
my @coldecimals = $tables{lc $tablenames[0]}->field_decimals ();

# Move the proper headings to vardat/numdat respectively. Skip the very
# since it contains NULLFLAGS (some XBase specific field we do not use).
# Also assumes that the first field in si_ci.dbf is always COMPANY_ID.
for (my $i=0; $i  windows_name ($tablename)
	or die XBase->errstr;
    my $savetablename = uc $tablename;
    $savetablename =~ s/\.DBF//;

    my @tmpcolnames = $tables{lc $tablename}->field_names;
    my @tmpcoltypes = $tables{lc $tablename}->field_types;
    my @tmpcollengths = $tables{lc $tablename}->field_lengths;
    my @tmpcoldecimals = $tables{lc $tablename}->field_decimals;
    # Add all columns except the first (COMPANY_ID) and the last (NULLFLAGS)
    # to the @numdat:
    for (my $i=1; $i {colhdr}->{$colname} = [$pos, $length, $binpackstr];
    $pos += $length;
}
$binhash->{bytesperrow} = $pos;

print STDERR "binpackbytesperrow: $pos\n";
#print STDERR "binpackstr: $binpackstr\n";

# If we get here, we can assume we have some nice headers in @vardat/@numdat
# respectively. This is a good time to open the files and output the headers
# of at least @vardat (we assume that si_ci.dbf is the only table that contain
# variable type data):

open (VARDAT, ">$vardatfilename")
    or die "error creating $vardatfilename";

open (NUMDAT, ">$numdatfilename")
    or die "error creating $numdatfilename";

#open (BINDAT, ">test.bin");

print STDERR "Writing: $vardatfilename\n";

print STDERR "  (vardat headers: ", $#vardat+1, ")\n";

print VARDAT join ($SEPARATOR, @vardat), "\n"
    or die "error writing $vardatfilename";

print STDERR "Writing: $numdatfilename\n";

print STDERR "  (numdat headers: ", $#numdat+1, ")\n";

print NUMDAT join ($SEPARATOR, @numdat), "\n"
    or die "error writing $numdatfilename";

# Now for the data itself; open si_ci.dbf and iterate through each record.
# Based on the COMPANY_ID field, figure out all related columns and output
# everything into one row in @numdat with the same COMPANY_ID. Similar,
# save the variable length fields in @vardat.

my $activate_once = 1;
my $hourglass = 99999999999;
my $numrows = 0;
my $si_ci_cursor = $tables{lc $tablenames[0]}->prepare_select;
while (my @si_ci_data = $si_ci_cursor->fetch ()) {
    my (@varrow, @numrow);
    my $company_id = $si_ci_data[0];
    $numrows++;
    #print STDERR "  (company_id is $company_id)\n";
    push @varrow, $company_id;
    push @numrow, $company_id;
    # All columns except COMPANY_ID (assumed to be at the zero position):
    for (my $i=1; $i prepare_select ("COMPANY_ID");
			while (my @data = $cursor->fetch ()) {
				my $recordid = $cursor->last_fetched;
				$tablecache{lc $tablename}->{$data[0]} = $recordid;
			}
		}
		if (defined ($tablecache{lc $tablename}->{$company_id})) {
			my @tmpdata = $tables{lc $tablename}->get_record ($tablecache{lc $tablename}->{$company_id});
			if (@tmpdata) {
				if (si_data_bugs ($dates[1], $varrow[$var_nametocol{TICKER}])) {
					# If data has bugs in it, pretend the record is deleted to avoid
					# it being added to the dataset.
					$tmpdata[0] = 1;
					$missingcompanyid = 1;
				}

				# Ok, we have a data row in @data. The first column contains the DELETED flag which
				# we need to verify (to avoid working with deleted data).
				if ($tmpdata[0] != 1) {
					# Ok, row is not deleted. Also skip first (COLUMN_ID) and last value 
					# as usual (NULLFLAGS):
					for (my $j=2; $j  100) {
			print STDERR ".";
			$hourglass = 0;
		}
		# Output data to their respective files:
		print VARDAT join ($SEPARATOR, @varrow), "\n";
		print NUMDAT join ($SEPARATOR, @numrow), "\n";
		if (0) {
			# Create test binary file:
			my $binstr = '';
			for (my $i=0; $i {colhdr}->{$colname}};
				my $value = $numrow[$i];
				# Blank out DEBT_RATE
				#if ($colname eq 'DEBT_RATE') {
				#    $value = "";
				#}
				if ($binpackstr eq 'f1') {
					if ($value eq '') {
						$binstr .= "\0\0\xC0\x7F";
					} else {
						$binstr .= pack ($binpackstr, $value);
					}
				} else {
					$binstr .= pack ($binpackstr, $value);
				}
				my $companyid_col = $num_nametocol{COMPANY_ID};
				my $ticker_col = $var_nametocol{TICKER};
				#print STDERR "companyidcol $companyid_col ticker_col $ticker_col\n";
				$binhash->{companyid_blockno}->{$numrow[$companyid_col]} =
					[$numrows-1, $varrow[$ticker_col]];
			}
			#print BINDAT $binstr;
		}
	}
}

# Close output files
close (NUMDAT);
close (VARDAT);

#close (BINDAT);
#open (BINDAT, ">test.bix");
#print BINDAT Data::Dumper->Dump ([$binhash], [qw(binhash)]);
#close (BINDAT);

# Output runtime statistics
print STDERR "\n  (numrows: $numrows)\n";
my $time_total = time - $time_start;
if ($time_total > 0) {
    printf STDERR "Total runtime: %d seconds (%.2f rows/sec)\n", $time_total, ($numrows / $time_total);
}

sub si_data_bugs {
	my ($date, $ticker) = @_;
	#print STDERR "v $value d $date t $ticker cn $colname\n";
	if (($date eq '20040924') && ($ticker eq 'GBCS')) {
		print STDERR "\n$date: Skipping $ticker due to bugs in SI data";
		return 1;
	}
}

# Verifies that all files given exists. Return false if not.
sub files_exists {
    my $ok = 1;
    foreach my $filename (@_) {
	if (!windows_name ($filename)) {
	    print STDERR "File not found: $filename\n";
	    undef $ok;
	}
    }
    return $ok;
}

# Attempts to open a filename with no regard for case sensitivity.
# Assumes first attempt is in ALL LOWER CASE.
sub windows_name {
    my $filename = shift;
    my $orgfilename = $filename;
    for (my $attempt=0; $attempt

Add new comment

Filtered HTML

  • Web page addresses and e-mail addresses turn into links automatically.
  • Syntax highlight code surrounded by the {syntaxhighlighter SPEC}...{/syntaxhighlighter} tags, where SPEC is a Syntaxhighlighter options string or class="OPTIONS" [title="the title"].
  • Allowed HTML tags: <a> <em> <strong> <cite> <blockquote> <code> <ul> <ol> <li> <dl> <dt> <dd> <h2> <h3>
  • Lines and paragraphs break automatically.

Plain text

  • No HTML tags allowed.
  • Web page addresses and e-mail addresses turn into links automatically.
  • Lines and paragraphs break automatically.
CAPTCHA
This question is for testing whether you are a human visitor and to prevent automated spam submissions.