This chapter is all about data in the various shapes, forms, and formats they take. We explore different ways to format and store data: JSON, databases, SQL, NoSQL, and memory mapping. We also discuss concepts like atomicity, consistency, isolation, and durability of data transactions.
Note: The examples below are abridged; the book contains more details.
Note that JSON is very similar to legitimate Python code, making Python particularly easy for interacting with Python code:
import json
# creating a Python object (map) holding a person's address
personRecord = {
"firstName": "Jane",
"lastName": "Doe",
"yearOfBirth": 1975,
"address": {
"streetAddress": "100 Main Street",
"city": "Los Angeles",
"state": "CA",
"postalCode": "90021"
},
"phoneNumbers": [
{ "type": "home", "number": "(444) 555-1234" },
{ "type": "office", "number": "(444) 555-1235" }
]
}
# convert the Python object into a JSON string
personRecordJSON = json.dumps(personRecord)
personRecordReconstructed = json.loads(personRecordJSON)
# print the firstName field of the reconstructed Python object
print(personRecordReconstructed['firstName'])
Python’s pickle module is an alternative to JSON serialization:
# Example 1 - the pickle module
import pickle
patients = {
"age" : [25.2, 35.4, 52.1],
"height" : [68.1, 62.5, 60.5],
"weight" : [170.2, 160.7, 185.5]
}
# binary serialization
serialized_patients = pickle.dumps(patients)
# deserialization
patients_reconstructed = pickle.loads(serialized_patients)
# Example 2 - pickling using pandas
import pandas as pd
patients = {
"age" : [25.2, 35.4, 52.1],
"height" : [68.1, 62.5, 60.5],
"weight" : [170.2, 160.7, 185.5]
}
patients_df = pd.DataFrame(patients)
# serialize and save to binary disk
patients_df.to_pickle('patients.txt')
# read binary serialization to Python object
patients_reconstructed = pd.read_pickle('patients.txt')
print(patients_reconstructed)
Shelves is a module that provides persistent dictionary objects whose values can be any Python object that can be pickled.
import shelve
# Example 1 - store
patients_df = {
"age" : [25.2, 35.4, 52.1],
"height" : [68.1, 62.5, 60.5],
"weight" : [170.2, 160.7, 185.5]
}
records = shelve.open('patients')
records['patients'] = patients_df
records.close()
# Example 2 - load and manipulate
records = shelve.open('patients')
patients_reconstructed = records['patients']
del records['patients'] # delete value from disk
records.close()
Java’s serialization framework provides a convenient mechanism to serialize arbitrary objects into a binary sequence:
import java.io.Serializable;
public class PatientInfo implements Serializable {
private static final long serialVersionUID = 42;
private final String name;
private final int age;
public PatientInfo(final String name, final int age) {
this.name = name;
this.age = age;
}
}
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.List;
public class PersistPatientInfoExample {
public static void main(final String[] args) {
final String filename = "patients";
final PatientInfo patient1 = new PatientInfo("John Smith", 33);
final PatientInfo patient2 = new PatientInfo("Jane Doe", 30);
final List<PatientInfo> PatientList = new ArrayList<>();
PatientList.add(patient1);
PatientList.add(patient2);
try {
final FileOutputStream fos = new FileOutputStream(filename);
final ObjectOutputStream out = new ObjectOutputStream(fos);
out.writeObject(PatientList);
out.close();
} catch (IOException ex) {
ex.printStackTrace();
}
}
}
import java.io.FileInputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.List;
public class GetPatientInfoExample {
public static void main(final String[] args) {
try {
final FileInputStream fis = new FileInputStream("patients");
final ObjectInputStream in = new ObjectInputStream(fis);
final List patientsInfo = (ArrayList) in.readObject();
in.close();
System.out.println(patientsInfo.size() + " patients found");
} catch (IOException ex) {
ex.printStackTrace();
} catch (ClassNotFoundException ex) {
ex.printStackTrace();
}
}
}
cd chapter13
javac PatientInfo.java PersistPatientInfoExample.java GetPatientInfoExample.java
java PersistPatientInfoExample && java GetPatientInfoExample
Python offers a particularly convenient mechanism for accessing data in HDF files using the PyTables module:
from tables import *
# creating an object representing table rows
class Person(IsDescription):
name = StringCol(16) # 16 character string
age = Int32Col()
# opening a HDF5 file
h5file = open_file("people.h5", mode = "w", title = "personnel")
# definining a group in the HDF4 file
group = h5file.create_group("/", "records", "names and ages")
# defining a table in the group
table = h5file.create_table(group, "Personnel1",
Person, "Personnel Table 1")
Person = table.row
for ind in xrange(10):
Person['name'] = 'John Doe ' + str(ind)
Person['age'] = 20 + ind
Person.append()
for ind in xrange(10):
Person['name'] = 'Jane Smith ' + str(ind)
Person['age'] = 20 + ind
Person.append()
table.flush() # update disk
# getting a pointer to table
table = h5file.root.records.Personnel1
# running a query against file
arr = [x['name'] for x in table.iterrows() if
x['age'] > 25 and x['age'] < 28]
print(arr)
The following code creates a table, then adds two rows, and then sends a query and displaying the results:
import sqlite3
import pandas.io.sql
# create a connection
con = sqlite3.connect(':memory')
# create a table
createStmt = """
CREATE TABLE LOCATIONS(
ID INTEGER PRIMARY KEY,
CITY CHAR(20),
STATE CHAR(20),
COUNTRY CHAR (20) NOT NULL);
"""
con.execute(createStmt)
con.commit()
# add two rows to the table
data = [(13, 'Los Angeles', 'CA', 'USA'),
(21, 'Chicago', 'IL', 'USA')]
addStmt = "INSERT INTO LOCATIONS VALUES (?, ?, ?, ?)"
con.executemany(addStmt, data)
con.commit()
# query the databse
data = con.execute('select * from LOCATIONS')
print(data.fetchall())
print(data.description)