Course 3 - Getting and Cleaning Data - Week 3 - Notes

Greg Foletta

2019-10-25

Summarising Data

Read in the Baltimore restaurants dataset with read_csv(). The tidyverse packages (readr, dplyr, tidyr, tibble) are used throughout these notes.

# Load the tidyverse (read_csv(), %>%, tibbles, joins) and read in the data
library(tidyverse)

bmore_rstrnts <- read_csv('https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD')
## Parsed with column specification:
## cols(
##   name = col_character(),
##   zipCode = col_double(),
##   neighborhood = col_character(),
##   councilDistrict = col_double(),
##   policeDistrict = col_character(),
##   `Location 1` = col_character()
## )

Take a quick glance at the data.

# Look at the first few entries
head(bmore_rstrnts)
## # A tibble: 6 x 6
##   name    zipCode neighborhood councilDistrict policeDistrict `Location 1` 
##   <chr>     <dbl> <chr>                  <dbl> <chr>          <chr>        
## 1 410       21206 Frankford                  2 NORTHEASTERN   "4509 BELAIR…
## 2 1919      21231 Fells Point                1 SOUTHEASTERN   "1919 FLEET …
## 3 SAUTE     21224 Canton                     1 SOUTHEASTERN   "2844 HUDSON…
## 4 #1 CHI…   21211 Hampden                   14 NORTHERN       "3998 ROLAND…
## 5 #1 chi…   21223 Millhill                   9 SOUTHWESTERN   "2481 freder…
## 6 19TH H…   21218 Clifton Park              14 NORTHEASTERN   "2722 HARFOR…
# Look at the last few entries.
tail(bmore_rstrnts)
## # A tibble: 6 x 6
##   name    zipCode neighborhood councilDistrict policeDistrict `Location 1` 
##   <chr>     <dbl> <chr>                  <dbl> <chr>          <chr>        
## 1 ZEEBA …   21230 Federal Hill              10 SOUTHERN       "916 LIGHT S…
## 2 ZEN WE…   21212 Rosebank                   4 NORTHERN       "5916 YORK R…
## 3 ZIASCOS   21231 Washington …               1 SOUTHEASTERN   "1313 PRATT …
## 4 "ZINK'…   21213 Belair-Edis…              13 NORTHEASTERN   "3300 LAWNVI…
## 5 ZISSIM…   21211 Hampden                    7 NORTHERN       "1023 36TH S…
## 6 ZORBAS    21224 Greektown                  2 SOUTHEASTERN   "4710 EASTER…
# A summary of the data frame
summary(bmore_rstrnts)
##      name              zipCode       neighborhood       councilDistrict 
##  Length:1327        Min.   :-21226   Length:1327        Min.   : 1.000  
##  Class :character   1st Qu.: 21202   Class :character   1st Qu.: 2.000  
##  Mode  :character   Median : 21218   Mode  :character   Median : 9.000  
##                     Mean   : 21185                      Mean   : 7.191  
##                     3rd Qu.: 21226                      3rd Qu.:11.000  
##                     Max.   : 21287                      Max.   :14.000  
##  policeDistrict      Location 1       
##  Length:1327        Length:1327       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 

We can look at the quantiles of quantitative variables.

quantile(bmore_rstrnts$councilDistrict)
##   0%  25%  50%  75% 100% 
##    1    2    9   11   14

We can create a frequency table with table(). The useNA argument controls whether the table includes NA values:

- ‘no’ - don’t include them.
- ‘ifany’ - only if the NA count is positive.
- ‘always’ - include even if there are zero NAs.

The default is ‘no’.

table(bmore_rstrnts$zipCode, useNA = 'ifany')
## 
## -21226  21201  21202  21205  21206  21207  21208  21209  21210  21211 
##      1    136    201     27     30      4      1      8     23     41 
##  21212  21213  21214  21215  21216  21217  21218  21220  21222  21223 
##     28     31     17     54     10     32     69      1      7     56 
##  21224  21225  21226  21227  21229  21230  21231  21234  21237  21239 
##    199     19     18      4     13    156    127      7      1      3 
##  21251  21287 
##      2      1

We can also cross-tabulate two variables:

table(bmore_rstrnts$councilDistrict, bmore_rstrnts$zipCode)
##     
##      -21226 21201 21202 21205 21206 21207 21208 21209 21210 21211 21212
##   1       0     0    37     0     0     0     0     0     0     0     0
##   2       0     0     0     3    27     0     0     0     0     0     0
##   3       0     0     0     0     0     0     0     0     0     0     0
##   4       0     0     0     0     0     0     0     0     0     0    27
##   5       0     0     0     0     0     3     0     6     0     0     0
##   6       0     0     0     0     0     0     0     1    19     0     0
##   7       0     0     0     0     0     0     0     1     0    27     0
##   8       0     0     0     0     0     1     0     0     0     0     0
##   9       0     1     0     0     0     0     0     0     0     0     0
##   10      1     0     1     0     0     0     0     0     0     0     0
##   11      0   115   139     0     0     0     1     0     0     0     1
##   12      0    20    24     4     0     0     0     0     0     0     0
##   13      0     0     0    20     3     0     0     0     0     0     0
##   14      0     0     0     0     0     0     0     0     4    14     0
##     
##      21213 21214 21215 21216 21217 21218 21220 21222 21223 21224 21225
##   1      2     0     0     0     0     0     0     7     0   140     1
##   2      0     0     0     0     0     0     0     0     0    54     0
##   3      2    17     0     0     0     3     0     0     0     0     0
##   4      0     0     0     0     0     0     0     0     0     0     0
##   5      0     0    31     0     0     0     0     0     0     0     0
##   6      0     0    15     1     0     0     0     0     0     0     0
##   7      0     0     6     7    15     6     0     0     0     0     0
##   8      0     0     0     0     0     0     0     0     2     0     0
##   9      0     0     0     2     8     0     0     0    53     0     0
##   10     0     0     0     0     0     0     1     0     0     0    18
##   11     0     0     0     0     9     0     0     0     1     0     0
##   12    13     0     0     0     0    26     0     0     0     0     0
##   13    13     0     1     0     0     0     0     0     0     5     0
##   14     1     0     1     0     0    34     0     0     0     0     0
##     
##      21226 21227 21229 21230 21231 21234 21237 21239 21251 21287
##   1      0     0     0     1   124     0     0     0     0     0
##   2      0     0     0     0     0     0     1     0     0     0
##   3      0     1     0     0     0     7     0     0     2     0
##   4      0     0     0     0     0     0     0     3     0     0
##   5      0     0     0     0     0     0     0     0     0     0
##   6      0     0     0     0     0     0     0     0     0     0
##   7      0     0     0     0     0     0     0     0     0     0
##   8      0     2    13     0     0     0     0     0     0     0
##   9      0     0     0    11     0     0     0     0     0     0
##   10    18     0     0   133     0     0     0     0     0     0
##   11     0     0     0    11     0     0     0     0     0     0
##   12     0     0     0     0     2     0     0     0     0     0
##   13     0     1     0     0     1     0     0     0     0     1
##   14     0     0     0     0     0     0     0     0     0     0

Check For Missing Values

# How many missing values
sum(is.na(bmore_rstrnts$councilDistrict))
## [1] 0
# Could also see if there are any missing values
any(is.na(bmore_rstrnts$councilDistrict))
## [1] FALSE
# Are all the zipcodes greater than 0
all(bmore_rstrnts$zipCode > 0)
## [1] FALSE

Row and Column Sums

Check which columns have NA values

colSums(is.na(bmore_rstrnts))
##            name         zipCode    neighborhood councilDistrict 
##               0               0               0               0 
##  policeDistrict      Location 1 
##               0               0
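
rowSums() works the same way across rows. A small sketch (output not shown; given the column sums above, every row here should have a count of zero):

# Count the missing values in each row and tabulate the counts
table(rowSums(is.na(bmore_rstrnts)))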

Values With Specific Characteristics

table(bmore_rstrnts$zipCode %in% c('21212'))
## 
## FALSE  TRUE 
##  1299    28
head(bmore_rstrnts[bmore_rstrnts$zipCode %in% c('21212'), ])
## # A tibble: 6 x 6
##   name   zipCode neighborhood  councilDistrict policeDistrict `Location 1` 
##   <chr>    <dbl> <chr>                   <dbl> <chr>          <chr>        
## 1 BAY A…   21212 Downtown                   11 CENTRAL        "206 REDWOOD…
## 2 ATWAT…   21212 Chinquapin P…               4 NORTHERN       "529 BELVEDE…
## 3 CAFE …   21212 Rosebank                    4 NORTHERN       "438 BELVEDE…
## 4 CERIE…   21212 Chinquapin P…               4 NORTHERN       "529 BELVEDE…
## 5 DUNKI…   21212 Homeland                    4 NORTHERN       "5422 YORK R…
## 6 FIELD…   21212 Mid-Govans                  4 NORTHERN       "5723 YORK R…
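
The same approach works with more than one value; a quick sketch (output not shown):

# Restaurants in either of two zip codes
table(bmore_rstrnts$zipCode %in% c('21212', '21213'))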

Cross Tabs

UCBAdmissions %>% as.data.frame() %>% summary()
##       Admit       Gender   Dept       Freq      
##  Admitted:12   Male  :12   A:4   Min.   :  8.0  
##  Rejected:12   Female:12   B:4   1st Qu.: 80.0  
##                            C:4   Median :170.0  
##                            D:4   Mean   :188.6  
##                            E:4   3rd Qu.:302.5  
##                            F:4   Max.   :512.0
xtabs(Freq ~ Gender + Admit, data = UCBAdmissions)
##         Admit
## Gender   Admitted Rejected
##   Male       1198     1493
##   Female      557     1278

Flat Tables

With many variables, a cross-tabulation becomes very hard to read. We can use ftable() to flatten it instead.

warpbreaks$replicate <- rep(1:9, len = 54)
xtabs(breaks ~ ., data = warpbreaks)
## , , replicate = 1
## 
##     tension
## wool  L  M  H
##    A 26 18 36
##    B 27 42 20
## 
## , , replicate = 2
## 
##     tension
## wool  L  M  H
##    A 30 21 21
##    B 14 26 21
## 
## , , replicate = 3
## 
##     tension
## wool  L  M  H
##    A 54 29 24
##    B 29 19 24
## 
## , , replicate = 4
## 
##     tension
## wool  L  M  H
##    A 25 17 18
##    B 19 16 17
## 
## , , replicate = 5
## 
##     tension
## wool  L  M  H
##    A 70 12 10
##    B 29 39 13
## 
## , , replicate = 6
## 
##     tension
## wool  L  M  H
##    A 52 18 43
##    B 31 28 15
## 
## , , replicate = 7
## 
##     tension
## wool  L  M  H
##    A 51 35 28
##    B 41 21 15
## 
## , , replicate = 8
## 
##     tension
## wool  L  M  H
##    A 26 30 15
##    B 20 39 16
## 
## , , replicate = 9
## 
##     tension
## wool  L  M  H
##    A 67 36 26
##    B 44 29 28
head(ftable(warpbreaks))
##                                                                           
##                            "replicate" "1" "2" "3" "4" "5" "6" "7" "8" "9"
##  "breaks" "wool" "tension"                                                
##  "10"     "A"    "L"                     0   0   0   0   0   0   0   0   0
##                  "M"                     0   0   0   0   0   0   0   0   0
##                  "H"                     0   0   0   0   1   0   0   0   0
##           "B"    "L"                     0   0   0   0   0   0   0   0   0
##                  "M"                     0   0   0   0   0   0   0   0   0
##                  "H"                     0   0   0   0   0   0   0   0   0
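
ftable() can also be applied directly to the xtabs() result above, flattening the three-way table into one compact layout; a sketch (output not shown):

# Flatten the wool x tension x replicate cross-tabulation
ftable(xtabs(breaks ~ ., data = warpbreaks))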

Size in Memory

Use object.size() to see how much memory an object occupies.

big_vec <- 1:1000000000
object.size(big_vec)
## 4000000048 bytes
print(object.size(big_vec), units = 'Mb')
## 3814.7 Mb
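
The units argument also accepts 'Gb' and 'auto'; a small sketch, and it's worth removing such a large vector when finished:

# Report the size in automatically chosen units, then free the memory
print(object.size(big_vec), units = 'auto')
rm(big_vec)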

Creating New Variables

Often the raw data doesn’t contain the value you’re looking for, so you need to transform the data to derive it. Common new variables include:

- ‘Missingness’ indicators (see the sketch after this list).
- ‘Cutting up’ quantitative variables into categories.
- Applying transforms.
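
As a sketch of an indicator variable (zipWrong is a name introduced here for illustration), we can flag the negative zip code that showed up in the summary above:

# TRUE where the zip code is clearly wrong (negative)
bmore_rstrnts$zipWrong <- bmore_rstrnts$zipCode < 0
table(bmore_rstrnts$zipWrong)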

Creating Categorical Variables

Use cut() to cut a quantitative variable into intervals; here the breaks are the variable’s quantiles.

bmore_rstrnts$zipGroups <- cut(bmore_rstrnts$zipCode, breaks = quantile(bmore_rstrnts$zipCode))

table(bmore_rstrnts$zipGroups)
## 
## (-2.123e+04,2.12e+04]  (2.12e+04,2.122e+04] (2.122e+04,2.123e+04] 
##                   337                   375                   282 
## (2.123e+04,2.129e+04] 
##                   332
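
Note that the group counts above sum to 1326 rather than 1327: cut() builds intervals that are open on the left, so the single minimum zip code falls outside every interval and becomes NA. A sketch of how to keep it:

# Keep the minimum value in the first interval instead of dropping it to NA
bmore_rstrnts$zipGroups <- cut(bmore_rstrnts$zipCode,
                               breaks = quantile(bmore_rstrnts$zipCode),
                               include.lowest = TRUE)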

We can use cut2() from the Hmisc package to make this easier, specifying the number of groups directly:

library(Hmisc)
bmore_rstrnts$zipGroups <- cut2(bmore_rstrnts$zipCode, g = 4)

table(bmore_rstrnts$zipGroups)
## 
## [-21226,21205) [ 21205,21220) [ 21220,21227) [ 21227,21287] 
##            338            375            300            314

Reshaping Data

You can melt the data frame, pulling the measurement variables into a single column:

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
car_data <- mtcars %>%
    # Pull the row names into a column before converting (tibbles drop row names)
    rownames_to_column(var = 'carname') %>%
    as_tibble()

car_melt <- car_data %>% melt(id = c('carname', 'gear', 'cyl'), measure.vars = c('mpg', 'hp'))
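
For comparison, a rough tidyr equivalent of the melt above (a sketch; tidyr is attached, as the masking message suggests, and the output is not shown):

# Keep the id columns, then gather mpg and hp into variable/value columns
car_data %>%
    select(carname, gear, cyl, mpg, hp) %>%
    gather(key = 'variable', value = 'value', mpg, hp)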

Casting

The dcast() function allows you to recast the melted frame into a different shape. Here we see the cylinders summarised in terms of mpg and hp.

# Standard
car_melt %>% dcast(cyl ~ variable)
## Aggregation function missing: defaulting to length
##   cyl mpg hp
## 1   4  11 11
## 2   6   7  7
## 3   8  14 14
# Can also pass a different function to summarise with
car_melt %>% dcast(cyl ~ variable, mean)
##   cyl      mpg        hp
## 1   4 26.66364  82.63636
## 2   6 19.74286 122.28571
## 3   8 15.10000 209.21429
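
A rough dplyr/tidyr equivalent of the second cast (a sketch; output not shown):

# Group-wise means, then spread back out to one column per variable
car_melt %>%
    group_by(cyl, variable) %>%
    summarise(value = mean(value)) %>%
    spread(variable, value)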

Averaging Variables

Use tapply() to apply a summary function over the levels of a factor; here we sum the counts for each spray:

tapply(InsectSprays$count, InsectSprays$spray, sum)
##   A   B   C   D   E   F 
## 174 184  25  59  42 200

We can also use split(), followed by lapply() and unlist():

InsectSplits <- split(InsectSprays$count, InsectSprays$spray)
InsectSplits
## $A
##  [1] 10  7 20 14 14 12 10 23 17 20 14 13
## 
## $B
##  [1] 11 17 21 11 16 14 17 17 19 21  7 13
## 
## $C
##  [1] 0 1 7 2 3 1 2 1 3 0 1 4
## 
## $D
##  [1]  3  5 12  6  4  3  5  5  5  5  2  4
## 
## $E
##  [1] 3 5 3 5 3 6 1 1 3 2 6 4
## 
## $F
##  [1] 11  9 15 22 15 16 13 10 26 26 24 13
InsectSplits <- lapply(InsectSplits, sum)
unlist(InsectSplits)
##   A   B   C   D   E   F 
## 174 184  25  59  42 200
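
The same summary with dplyr (a sketch; 'total' is a column name introduced here):

# Sum the counts within each spray group
InsectSprays %>%
    group_by(spray) %>%
    summarise(total = sum(count))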

Merging Data

The merge() function joins data frames together. By default it merges by all of the columns that have a common name.

a <- tibble(
    aa = 1:5,
    numbera_a = 6:10,
    normal_a = rnorm(5)
)

b <- tibble(
    aa = 1:5,
    number_b = 6:10,
    normal_b = rnorm(5)
)

merge(a, b)
##   aa numbera_a   normal_a number_b    normal_b
## 1  1         6  0.3841539        6 -0.28549072
## 2  2         7  0.2567812        7 -0.87727129
## 3  3         8 -1.5718964        8 -0.06147564
## 4  4         9  1.6131016        9 -0.98670425
## 5  5        10 -0.3146985       10  0.34195511
merge(a, b, by = 'aa')
##   aa numbera_a   normal_a number_b    normal_b
## 1  1         6  0.3841539        6 -0.28549072
## 2  2         7  0.2567812        7 -0.87727129
## 3  3         8 -1.5718964        8 -0.06147564
## 4  4         9  1.6131016        9 -0.98670425
## 5  5        10 -0.3146985       10  0.34195511

An alternative is to use the *_join() functions from dplyr.

full_join(a, b)
## Joining, by = "aa"
## # A tibble: 5 x 5
##      aa numbera_a normal_a number_b normal_b
##   <int>     <int>    <dbl>    <int>    <dbl>
## 1     1         6    0.384        6  -0.285 
## 2     2         7    0.257        7  -0.877 
## 3     3         8   -1.57         8  -0.0615
## 4     4         9    1.61         9  -0.987 
## 5     5        10   -0.315       10   0.342
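
dplyr also provides left_join(), inner_join(), and anti_join() with the same interface; a sketch (with these frames every key matches, so anti_join() returns no rows):

# Other join types on the shared 'aa' key
left_join(a, b, by = 'aa')
inner_join(a, b, by = 'aa')
anti_join(a, b, by = 'aa')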