Chapter 8: File Processing (Solutions to Even-Numbered Exercises)

Question 2

def read_weather_data(r):
    '''Read weather data from reader r in fixed-width format.

    Each non-blank line of r is one record.  The field widths are:
        4,2,2   YYYYMMDD (date)
        2,2,2   DDMMSS   (latitude)
        2,2,2   DDMMSS   (longitude)
        6,6,6   FF.FFF   (temp, deg. C; humidity, %; pressure, kPa)

    The result is a list holding one tuple of tuples per record:
    ((YYYY, MM, DD), (DD, MM, SS), (DD, MM, SS), (Temp, Hum, Press))
    The date, latitude and longitude parts are ints; the three
    measurements are floats.

    Lines that contain only whitespace are skipped.  A field whose text
    cannot be converted raises ValueError (from int or float).'''

    # (width, converter) for every field, grouped exactly the way the
    # converted values are grouped in each result record.
    fields = (((4, int), (2, int), (2, int)),       # date
              ((2, int), (2, int), (2, int)),       # latitude
              ((2, int), (2, int), (2, int)),       # longitude
              ((6, float), (6, float), (6, float))) # data
    result = []

    # For each record
    for line in r:
        # A blank line (typically a trailing one at the end of the file)
        # carries no record; converting its empty fields would fail.
        if not line.strip():
            continue

        start = 0
        record = []

        # for each group of fields in the record
        for group in fields:
            values = []
            for (width, target_type) in group:
                # Slice out this field's columns and convert the text.
                values.append(target_type(line[start:start + width]))
                # move on to the next field
                start += width

            # add this group's values to the record
            record.append(tuple(values))

        # Freeze the record so it really is the documented tuple of
        # tuples rather than a mutable list of tuples.
        result.append(tuple(record))
    return result

A tuple of tuples is easier to work with because values have been grouped into logical chunks. To get the longitude, for example, the programmer would get the third element of the record, then get its parts, rather than having to count along to get the seventh, eighth, and ninth values.

Question 4

import sys
from tsdl import skip_header

def skip_header(r):
    '''Advance reader r past its header and return the first line of
    real data, or an empty string if the header is all there is.

    The header is one mandatory description line followed by zero or
    more comment lines, each starting with '#'.'''

    # The description line must be present; consume and discard it.
    r.readline()

    # Keep reading until the current line is not a comment.  At end of
    # file readline() gives '', whose first-character slice is also '',
    # so the loop stops there too.
    current = r.readline()
    while current[:1] == '#':
        current = r.readline()

    return current

def smallest_value_skip(r, default=0):
    '''Read and process reader r to find the smallest value after
    the TSDL header.  Skip missing values, which are indicated
    with a hyphen.

    r must support readline().  Each data line holds one integer or a
    single hyphen; blank lines are ignored.  Return the smallest integer
    found, or default if the file contains no valid value at all (no
    data lines, or only missing values).  A line that is neither blank,
    a hyphen, nor an integer raises ValueError (from int).'''

    # No valid value seen yet.  None is used rather than a number so
    # that the first valid value is accepted whatever its size.
    smallest = None

    # skip_header returns the first data line, or '' if there is none.
    # That line gets the same treatment as every later one, so a missing
    # value in first position is skipped instead of being passed to int.
    line = skip_header(r)
    while line:
        text = line.strip()

        # Only process the line if it has a valid value.
        if text and text != '-':
            value = int(text)

            # If we find a smaller value, remember it.
            if smallest is None or value < smallest:
                smallest = value

        # readline() returns '' at end of file, which ends the loop.
        line = r.readline()

    if smallest is None:
        return default
    return smallest

if __name__ == "__main__":
    # Script entry point: print the smallest value in the TSDL file
    # named by the first command-line argument.
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit("usage: %s TSDL_FILE" % sys.argv[0])

    input_file = open(sys.argv[1], "r")
    try:
        # A single parenthesised argument prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print(smallest_value_skip(input_file))
    finally:
        # Close the file even if the data turns out to be malformed.
        input_file.close()

Question 6

def get_line(r):
    '''Return the next interesting line from the reader, or an empty
    string if there are no more interesting lines.

    A line is uninteresting if it is blank (only whitespace) or is a
    comment, i.e. starts with 'CMNT' once surrounding whitespace has
    been removed.  The returned line has its leading and trailing
    whitespace stripped.'''

    # Test the raw line for end of file *before* stripping: readline()
    # returns '' only at end of file, whereas a blank line is '\n' and
    # must be skipped rather than mistaken for the end.
    raw = r.readline()
    while raw:
        line = raw.strip()
        if line and not line.startswith('CMNT'):
            return line
        raw = r.readline()

    # Ran off the end of the file without finding an interesting line.
    return ''

def read_molecule(r):
    '''Read a single molecule from reader r and return it,
    or return None to signal end of file.

    The molecule is a list whose first item is the name taken from the
    "COMPND name" line, followed by one (type, x, y, z) tuple per
    "ATOM num type x y z" line, up to the line starting with "END".
    All values are kept as the strings read from the file.

    Raises ValueError if a line does not have the expected number of
    fields, or if the file ends before "END" is reached.'''

    # If there isn't another line, we're at the end of the file.
    line = get_line(r)
    if not line:
        return None

    # Name of the molecule: "COMPND   name".  Split only once so that a
    # name containing spaces is kept whole.
    parts = line.split(None, 1)
    if len(parts) != 2:
        raise ValueError('expected "COMPND name", got %r' % line)
    name = parts[1]

    # Other lines are either "END" or "ATOM num type x y z"
    molecule = [name]
    while True:
        line = get_line(r)

        # get_line gives '' when the input is exhausted; report that
        # clearly instead of failing on the field unpacking below.
        if not line:
            raise ValueError(
                'unexpected end of file in molecule %r (missing END)' % name)

        if line.startswith('END'):
            break

        # "kind" rather than "type", so the builtin is not shadowed.
        key, num, kind, x, y, z = line.split()
        molecule.append((kind, x, y, z))

    return molecule