
Chapter 9

Processing Data in R and Python

Note: The examples below are abridged; the book contains more details.

  1. Missing Data in R - Part I
  2. Missing Data in R - Part II
  3. Missing Data in Python
  4. Outliers
  5. Skewness and Power Transformation - Part I
  6. Skewness and Power Transformation - Part II
  7. Binning
  8. Indicator Variables
  9. Random Sampling, Partitioning, and Shuffling
  10. Concatenations and Joins
  11. Reshaping Data
  12. The Split-Apply-Combine Framework

Missing Data in R - Part I

The code below analyzes the movies dataframe from the ggplot2movies package, which contains missing values in some columns:

library(ggplot2)  # qplot is used throughout this chapter
library(ggplot2movies)
names(movies)
movies[9000:9020, 1:6]  # display rows 9000-9020 and the first 6 columns
mean(movies$length)                # length has no missing values, so the mean is returned
mean(movies$budget)                # returns NA since budget has missing values
mean(movies$budget, na.rm = TRUE)  # mean of the non-missing budget values
mean(is.na(movies$budget))         # fraction of missing budget values

moviesNoNA = na.omit(movies)  # drop all rows containing missing values
qplot(rating, budget, data = moviesNoNA, size = I(1.2)) +
  stat_smooth(color = "red", size = I(2), se = FALSE)
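
Instead of dropping incomplete rows, missing entries can also be imputed. The minimal sketch below mirrors the fillna example in the Python section by replacing missing budget values with the mean of the observed budgets (moviesImputed and budget_mean are illustrative names):

moviesImputed = movies
budget_mean = mean(movies$budget, na.rm = TRUE)  # mean of the observed budgets
moviesImputed$budget[is.na(moviesImputed$budget)] = budget_mean
mean(is.na(moviesImputed$budget))  # the fraction of missing budgets is now 0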

Missing Data in R - Part II

The code below removes rows with missing values and plots votes as a function of rating:

library(ggplot2movies)
moviesNoNA = na.omit(movies)
qplot(rating, votes, data = moviesNoNA, size = I(1.2))

Missing Data in Python

The code below defines a small dataframe with missing values and demonstrates the isnull, dropna, and fillna methods:

import numpy as np
import pandas as pd

data = pd.DataFrame([[1, 2, np.nan],
                     [3, np.nan, 4],
                     [1, 2, 3]])

# Example 1 - the isnull method
print(f"data:\n{data}")
print("pd.isnull(data):")
print(pd.isnull(data))

# Example 2 - the dropna method
# Drop rows
print("data.dropna():")
print(data.dropna())
# Drop columns
print("data.dropna(axis = 1):")
print(data.dropna(axis = 1))

# Example 3 - the fillna method
# Fill in missing entries with column means
print("data.fillna(data.mean()):")
print(data.fillna(data.mean()))

Outliers

# Example 1 - the Black Monday stock crash on October 19, 1987
library(Ecdat)
data(SP500, package = 'Ecdat')
qplot(r500,
      main = "Histogram of log(P(t)/P(t-1)) for SP500 (1981-91)",
      xlab = "log returns",
      data = SP500)
qplot(seq(along = r500),
      r500,
      data = SP500,
      geom = "line",
      xlab = "trading days since January 1981",
      ylab = "log returns",
      main = "log(P(t)/P(t-1)) for SP500 (1981-91)")

# Example 2 - the R code below removes outliers
original_data = rnorm(20)          # simulate 20 standard normal values
original_data[1] = 1000            # inject a large outlier
sorted_data = sort(original_data)
filtered_data = sorted_data[3:18]  # trim the two smallest and two largest values
lower_limit = mean(filtered_data) - 5 * sd(filtered_data)
upper_limit = mean(filtered_data) + 5 * sd(filtered_data)
not_outlier_ind = (lower_limit < original_data) &
  (original_data < upper_limit)    # TRUE for values inside the limits
print(not_outlier_ind)
data_w_no_outliers = original_data[not_outlier_ind]

# Example 3 - winsorizes data containing an outlier
library(robustHD)
original_data = c(1000, rnorm(10))
print(original_data)
print(winsorize(original_data))
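
Winsorizing clips extreme values toward the bulk of the data instead of removing them. The rough sketch below clips at the 5th and 95th percentiles; it is only an approximation of winsorize, which relies on robust estimates of center and scale:

lo = quantile(original_data, 0.05)
hi = quantile(original_data, 0.95)
print(pmin(pmax(original_data, lo), hi))  # values outside [lo, hi] are clipped to the limits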

Skewness and Power Transformation - Part I

# Example 1 - the diamonds dataset
head(diamonds)
summary(diamonds)

# Example 2 - the price histogram of a random subset of 1000 diamonds
diamondsSubset = diamonds[sample(dim(diamonds)[1], 1000), ]
qplot(price, data = diamondsSubset)

# Example 3 - a transformed histogram
qplot(log(price), size = I(1), data = diamondsSubset)
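
As a rough numerical check, a simple sample-skewness statistic confirms that the log transform reduces the strong right skew of price:

skew = function(x) mean((x - mean(x))^3) / sd(x)^3  # rough sample skewness
skew(diamondsSubset$price)       # large positive value: right-skewed
skew(log(diamondsSubset$price))  # substantially smaller after the log transform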

# Example 4 - transforming both variables
qplot(carat,
      price,
      size = I(1),
      data = diamondsSubset)
qplot(carat,
      log(price),
      size = I(1),
      data = diamondsSubset)
qplot(log(carat),
      price,
      size = I(1),
      data = diamondsSubset)
qplot(log(carat),
      log(price),
      size = I(1),
      data = diamondsSubset)

# Example 5 - display the original untransformed variables on logarithmic axes
qplot(carat,
      price,
      log = "xy",
      size = I(1),
      data = diamondsSubset)

Skewness and Power Transformation - Part II

# Example 1 - the Animals dataset
library(MASS)
Animals
qplot(brain, body, data = Animals)

# Example 2 - a power transformation
qplot(brain, body, log = "xy", data = Animals)
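
A small numerical check illustrates why the log-log scale helps: the correlation between brain and body is weak on the original scale, where a few very large animals dominate, and much stronger after taking logs:

cor(Animals$brain, Animals$body)            # weak on the original scale
cor(log(Animals$brain), log(Animals$body))  # substantially stronger on the log-log scale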

Binning

# Example 1 - binning
import numpy as np
import pandas as pd

data = [23, 13, 5, 3, 41, 33]
bin_boundaries = [0, 10, 20, 30, 40, 50]
bin_values = [5, 15, 25, 35, 45]
cut_data = pd.cut(data, bin_boundaries)
print("labels:", cut_data.codes)
binned_data = np.zeros(shape = np.size(data))
for k, x in enumerate(cut_data.codes):
    binned_data[k] = bin_values[x]
print("binned_data:", binned_data)

# Example 2 - quantile binning
original_data = [23, 13, 5, 3, 41, 33]
cut_data = pd.qcut(original_data, 2)
print("C.labels:", cut_data.codes)

Indicator Variables

import numpy as np
import pandas as pd

data = [23, 13, 5, 3, 41, 33]
indicator_values = pd.get_dummies(pd.qcut(data, 2))
print("indicator_values:", indicator_values)

Random Sampling, Partitioning, and Shuffling

# Example 1 - sample 3 times from the values [1, 2, 3, 4] without replacement
sampled_row_indices = sample(1:4, 3, replace = FALSE)
print(sampled_row_indices)

# Example 2 - sampled rows
d = array(data = seq(1, 20, length.out = 20), dim = c(4, 5))
d_sampled = d[sampled_row_indices, ]
print(d_sampled)

# Example 3 - partition an array into two new arrays of sizes 75% and 25%
d = array(data = seq(1, 20, length.out = 20), dim = c(4, 5))
print(d)
rand_perm = sample(4, 4)
first_set_of_indices = rand_perm[1:floor(4 * .75)]
second_set_of_indices = rand_perm[(floor(4 * .75) + 1):4]
print(d[first_set_of_indices, ])
print(d[second_set_of_indices, ])

# Example 4 - shuffling
d = array(data = seq(1, 20, length.out = 20), dim = c(4, 5))
print(d)
d_shuffled = d[sample(4, 4), ]
print(d_shuffled)
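
Since sample produces different results on every run, the examples above are not reproducible as written; fixing the random seed first (any fixed value works) makes them repeatable:

set.seed(1)  # fix the seed so the random shuffle below is reproducible
d_shuffled = d[sample(4, 4), ]
print(d_shuffled)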

Concatenations and Joins

import numpy as np
import pandas as pd

# Example 1 - concatenate two dataframes
data1 = {"ID" : ["2134", "4524"],
         "name" : ["John Smith", "Jane Doe"]}
df1 = pd.DataFrame(data1)
data2 = {"ID" : ["9423", "3483"],
         "name" : ["Bob Jones", "Mary Smith"]}
df2 = pd.DataFrame(data2)
df3 = pd.concat([df1, df2])
print("concatenation of df1 and df2:\n" + str(df3))

# Example 2 - concatenating dataframes with non-identical columns
data1 = {"ID" : ["2134", "4524"],
         "name" : ["John Smith", "Jane Doe"]}
df1 = pd.DataFrame(data1)
data2 = {"ID" : ["9423", "3483"],
         "nick name" : ["Bobby", "Abby"]}
df2 = pd.DataFrame(data2)
df3 = pd.concat([df1, df2])
print("concatenation of df1 and df2:\n" + str(df3))

# Example 3 - inner join
data1 = {"ID" : ["2134", "4524"],
         "name" : ["John Smith", "Jane Doe"]}
df1 = pd.DataFrame(data1)
data2 = {"ID" : ["6325", "2134"],
         "age" : [25, 35],
         "tenure" : [3, 8]}
df2 = pd.DataFrame(data2)
df3 = pd.merge(df1, df2, on = 'ID', how = 'inner')
print("inner join of df1 and df2:\n" + str(df3))

# Example 4 - outer and left joins
data1 = {"ID" : ["2134", "4524"],
         "name" : ["John Smith", "Jane Doe"]}
df1 = pd.DataFrame(data1)
data2 = {"ID" : ["6325", "2134"],
         "age" : [25, 35],
         "tenure" : [3, 8]}
df2 = pd.DataFrame(data2)
df3 = pd.merge(df1, df2, on = 'ID', how = 'outer')
print("outer join of df1 and df2:\n" + str(df3))
df4 = pd.merge(df1, df2, on = 'ID', how = 'left')
print("left join of df1 and df2:\n" + str(df4))

# Example 5 - outer join with multiple records sharing the same key
data1 = {"ID" : ["2134", "4524", "2134"],
         "name" : ["John Smith", "Jane Doe", "JOHN SMITH"]}
df1 = pd.DataFrame(data1)
data2 = {"ID" : ["6325", "2134"],
         "age" : [25, 35],
         "tenure" : [3, 8]}
df2 = pd.DataFrame(data2)
df3 = pd.merge(df1, df2, on = 'ID', how = 'outer')
print("outer join of df1 and df2:\n" + str(df3))

# Example 6 - outer join with suffixes for overlapping column names
data1 = {"ID" : ["2134", "4524"],
         "f1" : ["John Smith", "Jane Doe"]}
df1 = pd.DataFrame(data1)
data2 = {"ID" : ["6325", "2134"],
         "f1" : [25, 35],
         "f2" : [3, 8]}
df2 = pd.DataFrame(data2)
df3 = pd.merge(df1, df2, on = 'ID', how = 'outer')
print("outer join of df1 and df2:\n" + str(df3))

Reshaping Data

# Example 1 - the melt function
library(reshape2)
# toy (wide) dataframe in the reshape2 package
smiths
# columns 2-5 are measurements, column 1 is the key
melt(smiths, id = 1)
# columns 3-5 are measurements, columns 1-2 are the keys
melt(smiths, id = c(1, 2))
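
The dcast function reverses melt; the brief sketch below rebuilds the wide smiths dataframe from its molten form (subject is the key column, and each subject-variable pair occurs once, so no aggregation is needed):

dcast(melt(smiths, id = 1), subject ~ variable)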

# Example 2 - the tips dataset
tips$total.bill = tips$total_bill  # copy total_bill into a new column named total.bill
qplot(total.bill,
      tip,
      facets = sex~time,
      size = I(1.5),
      data = tips)

# Example 3 - analyzing tips and bills
head(tips)  # first six rows
tipsm = melt(tips,
             id = c("sex","smoker","day","time","size"))
head(tipsm)  # first six rows
tail(tipsm)  # last six rows
# Mean of the measurement variables broken down by sex.
# Note the role of mean as the aggregating function.
dcast(tipsm,
      sex~variable,
      fun.aggregate = mean)
# Number of occurrences of the measurement variables broken down by sex.
# Note the role of length as the aggregating function.
dcast(tipsm,
      sex~variable,
      fun.aggregate = length)
# Average total bill and tip for different times
dcast(tipsm,
      time~variable,
      fun.aggregate = mean)
# Number of occurrences broken down by sex and time:
dcast(tipsm,
      sex+time~variable,
      fun.aggregate = length)
# Average total bill and tip broken down by sex and time, with added margins
dcast(tipsm,
      sex+time~variable,
      fun.aggregate = mean,
      margins = TRUE)

The Split-Apply-Combine Framework

# Example 1 - the ddply function
library(plyr)
head(baseball)
# Count the number of player seasons recorded for each year
bbPerYear = ddply(baseball, "year", "nrow")
head(bbPerYear)
qplot(x = year,
      y = nrow,
      data = bbPerYear,
      geom = "line",
      ylab = "number of player seasons")

# Example 2 - another example of the ddply function
# Compute the mean rbi (runs batted in) for each year.
# summarise plays the role of the apply function; its argument
# is an expression that computes the rbi mean within each year
bbMod = ddply(baseball,
              "year",
              summarise,
              mean.rbi = mean(rbi, na.rm = TRUE))
head(bbMod)
qplot(x = year,
      y = mean.rbi,
      data = bbMod,
      geom = "line",
      ylab = "mean RBI")

# Example 3 - adding a new variable
# Add a column career.year which measures the number of years
# passed since each player started batting
bbMod2 = ddply(baseball,
               "id",
               transform,
               career.year = year - min(year) + 1)
# Sample a random subset of 3000 rows to avoid over-plotting
bbSubset = bbMod2[sample(dim(bbMod2)[1], 3000), ]
qplot(career.year,
      rbi, data = bbSubset,
      size = I(0.8),
      geom = "jitter",
      ylab = "RBI",
      xlab = "years of playing") +
  geom_smooth(color = "red", se = FALSE, size = 1.5)

# Example 4 - the aaply function
dim(ozone)
latitude.mean = aaply(ozone, 1, mean)
longitude.mean = aaply(ozone, 2, mean)
time.mean = aaply(ozone, 3, mean)
longitude = seq(along = longitude.mean)
qplot(x = longitude,
      y = longitude.mean,
      ylab = "mean ozone level")
latitude = seq(along = latitude.mean)
qplot(x = latitude,
      y = latitude.mean,
      ylab = "mean ozone level",
      geom = "line")
months = seq(along = time.mean)
qplot(x = months,
      y = time.mean,
      geom = "line",
      ylab = "mean ozone level",
      xlab = "months since January 1985")