Chapter preview

Chapter 9

Processing Data in R and Python

There is no shortcut to knowledge; and there are no worthwhile data without preprocessing. In the first three sections of this chapter, we discuss situations that necessitate data preprocessing and how to handle them. In the final section we discuss how to manipulate data in general; specifically, how to manipulate data in R using the reshape2 and plyr packages and in Python using the pandas module.

Note: The examples below are abridged; the book contains more details.

  1. Missing Data in R - Part I
  2. Missing Data in R - Part II
  3. Missing Data in Python
  4. Outliers
  5. Skewness and Power Transformation - Part I
  6. Skewness and Power Transformation - Part II
  7. Binning
  8. Indicator Variables
  9. Random Sampling, Partitioning, and Shuffling
  10. Concatenations and Joins
  11. Reshaping Data
  12. The Split-Apply-Combine Framework

Missing Data in R - Part I

The code below analyzes the dataframe movies in the ggplot2movies package with some missing values:

library(ggplot2movies)
names(movies)
movies[9000:9020, 1:6]  # display 20 rows, 6 first columns
mean(movies$length)
mean(movies$budget)
mean(movies$budget, na.rm = TRUE)
mean(is.na(movies$budget))

moviesNoNA = na.omit(movies)
qplot(rating, budget, data = moviesNoNA, size = I(1.2)) +
  stat_smooth(color = "red", size = I(2), se = F)

Missing Data in R - Part II

library(ggplot2movies)
moviesNoNA = na.omit(movies)
qplot(rating, votes, data = moviesNoNA, size = I(1.2))

Missing Data in Python

The code below defines a small dataframe with missing values, and demonstrates the isnull, dropna, and fillna methods:

import numpy as np
import pandas as pd

data = pd.DataFrame([[1, 2, np.nan],
                    [3, np.nan, 4],
                    [1, 2, 3]])

# Example 1 - the isnull method
print(f"data:\n{data}")
print("pd.isnull(data):")
print(pd.isnull(data))

# Example 2 - the dropna method
# Drop rows
print("data.dropna():")
print(data.dropna())
# Drop columns
print("data.dropna(axis = 1):")
print(data.dropna(axis = 1))

# Example 3 - the fillna method
# Fill in missing entries with column means
print("data.fillna(data.mean()):")
print(data.fillna(data.mean()))

Outliers

# Example 1 - the Black Monday stock crash on October 19, 1987
library(Ecdat)
data(SP500, package = 'Ecdat')
qplot(r500,
      main = "Histogram of log(P(t)/P(t-1)) for SP500 (1981-91)",
      xlab = "log returns",
      data = SP500)
qplot(seq(along = r500),
      r500,
      data = SP500,
      geom = "line",
      xlab = "trading days since January 1981",
      ylab = "log returns",
      main = "log(P(t)/P(t-1)) for SP500 (1981-91)")

# Example 2 - the R code below removes outliers
original_data = rnorm(20) 
original_data[1] = 1000  
sorted_data = sort(original_data)  
filtered_data = sorted_data[3:18]  
lower_limit = mean(filtered_data) - 5 * sd(filtered_data)  
upper_limit = mean(filtered_data) + 5 * sd(filtered_data)
not_outlier_ind = (lower_limit < original_data) & 
  (original_data < upper_limit)
print(not_outlier_ind)  
data_w_no_outliers = original_data[not_outlier_ind]

# Example 3 - winsorizes data containing an outlier
library(robustHD)
original_data = c(1000, rnorm(10))
print(original_data)
print(winsorize(original_data))

Skewness and Power Transformation - Part I

# Example 1 - the diamonds dataset
head(diamonds)
summary(diamonds)

# Example 2 - the price histogram of a random subset of 1000 diamonds
diamondsSubset = diamonds[sample(dim(diamonds)[1], 1000), ]
qplot(price, data = diamondsSubset)

# Example 3 - a transformed histogram
qplot(log(price), size = I(1), data = diamondsSubset)

# Example 4 - transforming both variables
qplot(carat,
      price,
      size = I(1),
      data = diamondsSubset)
qplot(carat,
      log(price),
      size = I(1),
      data = diamondsSubset)
qplot(log(carat),
      price,
      size = I(1),
      data = diamondsSubset)
qplot(log(carat),
      log(price),
      size = I(1),
      data = diamondsSubset)

# Example 5 - display the original un-transformed variables on logarithmic axes
qplot(carat,
      price,
      log = "xy",
      size = I(1),
      data = diamondsSubset)

Skewness and Power Transformation - Part II

# Example 1 - the Animals dataset
library(MASS)
Animals
qplot(brain, body, data = Animals)

# Example 2 - a power transformation
qplot(brain, body, log = "xy", data = Animals)

Binning

# Example 1 - binning
import numpy as np
import pandas as pd

data = [23, 13, 5, 3, 41, 33]
bin_boundaries = [0, 10, 20, 30, 40, 50]
bin_values = [5, 15, 25, 35, 45]
cut_data = pd.cut(data, bin_boundaries)
print("labels:", cut_data.codes)
binned_data = np.zeros(shape = np.size(data))
for k, x in enumerate(cut_data.codes):
    binned_data[k] = bin_values[x]
print("binned_data:", binned_data)

# Example 2 - quantile binning
original_data = [23, 13, 5, 3, 41, 33]
cut_data = pd.qcut(original_data, 2)
print("C.labels:", cut_data.codes)

Indicator Variables

import numpy as np
import pandas as pd

data = [23, 13, 5, 3, 41, 33]
indicator_values = pd.get_dummies(pd.qcut(data, 2))
print("indicator_values:", indicator_values)

Random Sampling, Partitioning, and Shuffling

# Example 1 - sample 3 times from the values [1, 2, 3, 4] without replacements
sampled_row_indices = sample(1:4, 3, replace = FALSE)
print(sampled_row_indices)

# Example 2 - sampled rows
d = array(data = seq(1, 20, length.out = 20), dim = c(4, 5))
d_sampled = d[sampled_row_indices, ]
print(d_sampled)

# Example 3 - partition an array into two new arrays of sizes 75% and 25%
d = array(data = seq(1, 20, length.out = 20), dim = c(4, 5))
print(d)
rand_perm = sample(4, 4)
first_set_of_indices = rand_perm[1:floor(4 * .75)]
second_set_of_indices = rand_perm[(floor(4 * .75) + 1):4]
print(d[first_set_of_indices, ])
print(d[second_set_of_indices, ])

# Example 4 - shuffling
d = array(data = seq(1, 20, length.out = 20), dim = c(4, 5))
print(d)
d_shuffled = d[sample(4, 4), ]
print(d_shuffled)

Concatenations and Joins

import numpy as np
import pandas as pd

# Example 1 - concatenate two dataframes
data1 = {"ID" : ["2134", "4524"],
         "name" : ["John Smith", "Jane Doe"]}
df1 = pd.DataFrame(data1)
data2 = {"ID" : ["9423", "3483"],
         "name" : ["Bob Jones", "Mary Smith"]}
df2 = pd.DataFrame(data2)
df3 = pd.concat([df1, df2])
print("concatenation of df1 and df2:\n" + str(df3))

# Example 2 - concatenating dataframes with non identical columns
data1 = {"ID" : ["2134", "4524"],
         "name" : ["John Smith", "Jane Doe"]}
df1 = pd.DataFrame(data1)
data2 = {"ID" : ["9423", "3483"],
         "nick name" : ["Bobby", "Abby"]}
df2 = pd.DataFrame(data2)
df3 = pd.concat([df1, df2])
print("concatenation of df1 and df2:\n" + str(df3))

# Example 3 - inner join
data1 = {"ID" : ["2134", "4524"],
         "name" : ["John Smith", "Jane Doe"]}
df1 = pd.DataFrame(data1)
data2 = {"ID" : ["6325", "2134"],
         "age" : [25, 35],
         "tenure" : [3, 8]}
df2 = pd.DataFrame(data2)
df3 = pd.merge(df1, df2, on = 'ID', how = 'inner')
print("inner join of df1 and df2:\n" + str(df3))

# Example 3 - outer and left joins
data1 = {"ID" : ["2134", "4524"],
         "name" : ["John Smith", "Jane Doe"]}
df1 = pd.DataFrame(data1)
data2 = {"ID" : ["6325", "2134"],
         "age" : [25, 35],
         "tenure" : [3, 8]}
df2 = pd.DataFrame(data2)
df3 = pd.merge(df1, df2, on = 'ID', how = 'outer')
print("outer join of df1 and df2:\n" + str(df3))
df4 = pd.merge(df1, df2, on = 'ID', how = 'left')
print("left join of df1 and df2:\n" + str(df4))

# Example 4 - outer join with multiple records with the same key
data1 = {"ID" : ["2134", "4524", "2134"],
         "name" : ["John Smith", "Jane Doe", "JOHN SMITH"]}
df1 = pd.DataFrame(data1)
data2 = {"ID" : ["6325", "2134"],
         "age" : [25, 35],
         "tenure" : [3, 8]}
df2 = pd.DataFrame(data2)
df3 = pd.merge(df1, df2, on = 'ID', how = 'outer')
print("outer join of df1 and df2:\n" + str(df3))

# Example 5 - outer join with attribute name suffix
data1 = {"ID" : ["2134", "4524"],
         "f1" : ["John Smith", "Jane Doe"]}
df1 = pd.DataFrame(data1)
data2 = {"ID" : ["6325", "2134"],
         "f1" : [25, 35],
         "f2" : [3, 8]}
df2 = pd.DataFrame(data2)
df3 = pd.merge(df1, df2, on = 'ID', how = 'outer')
print("outer join of df1 and df2:\n" + str(df3))

Reshaping Data

# Example 1 - the melt function
library(reshape2)
# toy (wide) dataframe in the reshape2 package
smiths
# columns 2, 3, 4, 5 are measurements, 1 is key
melt(smiths, id = 1)
# columns 3, 4, 5 are measurements, 1,2 are key
melt(smiths, id = c(1, 2))

# Example 2 - the tips dataset
tips$total.bill = tips$total_bill
qplot(total.bill,
      tip,
      facets = sex~time,
      size = I(1.5),
      data = tips)

# Example 2 - analyzing tips and bills
head(tips)  # first six rows
tipsm = melt(tips,
             id = c("sex","smoker","day","time","size"))
head(tipsm)  # first six rows
tail(tipsm)  # last six rows
# Mean of measurement variables broken by sex.
# Note the role of mean as the aggregating function.
dcast(tipsm,
      sex~variable,
      fun.aggregate = mean)
# Number of occurrences for measurement variables broken by sex.
# Note the role of length as the aggregating function.
dcast(tipsm,
      sex~variable,
      fun.aggregate = length)
# Average total bill and tip for different times
dcast(tipsm,
      time~variable,
      fun.aggregate = mean)
# Similar to above with breakdown for sex and time:
dcast(tipsm,
      sex+time~variable,
      fun.aggregate = length)
# Similar to above, but with mean and added margins
dcast(tipsm,
      sex+time~variable,
      fun.aggregate = mean,
      margins = TRUE)

The Split-Apply-Combine Framework

# Example 1 - the ddply function
library(plyr)
head(baseball)
# Count number of players recorded for each year
bbPerYear = ddply(baseball, "year", "nrow")
head(bbPerYear)
qplot(x = year,
      y = nrow,
      data = bbPerYear,
      geom = "line",
      ylab = "number of player seasons")

# Example 2 - another example of the ddply function
# Compute mean rbi (batting attempt resulting in runs)
# for all years. Summarize is the apply function, which
# takes as argument a function that computes the rbi mean
bbMod = ddply(baseball,
              "year",
              summarise,
              mean.rbi = mean(rbi, na.rm = TRUE))
head(bbMod)
qplot(x = year,
      y = mean.rbi,
      data = bbMod,
      geom = "line",
      ylab = "mean RBI")

# Example 3 - adding a new variable
# Add a column career.year which measures the number of years
# passed since each player started batting
bbMod2 = ddply(baseball,
               "id",
               transform,
               career.year = year - min(year) + 1)
# Sample a random subset 3000 rows to avoid over-plotting
bbSubset = bbMod2[sample(dim(bbMod2)[1], 3000), ]
qplot(career.year,
      rbi, data = bbSubset,
      size = I(0.8),
      geom = "jitter",
      ylab = "RBI",
      xlab = "years of playing") +
  geom_smooth(color = "red", se = F, size = 1.5)

# Example 4 - the aaply function
dim(ozone)
latitude.mean = aaply(ozone, 1, mean)
longitude.mean = aaply(ozone, 2, mean)
time.mean = aaply(ozone, 3, mean)
longitude = seq(along = longitude.mean)
qplot(x = longitude,
      y = longitude.mean,
      ylab = "mean ozone level")
latitude = seq(along = latitude.mean)
qplot(x = latitude,
      y = latitude.mean,
      ylab = "mean ozone level",
      geom = "line")
months = seq(along = time.mean)
qplot(x = months,
      y = time.mean,
      geom = "line",
      ylab = "mean ozone level",
      xlab = "months since January 1985")