Summarising Data
bmore_rstrnts <- read_csv('https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD')
## Parsed with column specification:
## cols(
## name = col_character(),
## zipCode = col_double(),
## neighborhood = col_character(),
## councilDistrict = col_double(),
## policeDistrict = col_character(),
## `Location 1` = col_character()
## )
Take a quick glance at the data.
# Look at the first few entries
head(bmore_rstrnts)
## # A tibble: 6 x 6
## name zipCode neighborhood councilDistrict policeDistrict `Location 1`
## <chr> <dbl> <chr> <dbl> <chr> <chr>
## 1 410 21206 Frankford 2 NORTHEASTERN "4509 BELAIR…
## 2 1919 21231 Fells Point 1 SOUTHEASTERN "1919 FLEET …
## 3 SAUTE 21224 Canton 1 SOUTHEASTERN "2844 HUDSON…
## 4 #1 CHI… 21211 Hampden 14 NORTHERN "3998 ROLAND…
## 5 #1 chi… 21223 Millhill 9 SOUTHWESTERN "2481 freder…
## 6 19TH H… 21218 Clifton Park 14 NORTHEASTERN "2722 HARFOR…
# Look at the last few entries.
tail(bmore_rstrnts)
## # A tibble: 6 x 6
## name zipCode neighborhood councilDistrict policeDistrict `Location 1`
## <chr> <dbl> <chr> <dbl> <chr> <chr>
## 1 ZEEBA … 21230 Federal Hill 10 SOUTHERN "916 LIGHT S…
## 2 ZEN WE… 21212 Rosebank 4 NORTHERN "5916 YORK R…
## 3 ZIASCOS 21231 Washington … 1 SOUTHEASTERN "1313 PRATT …
## 4 "ZINK'… 21213 Belair-Edis… 13 NORTHEASTERN "3300 LAWNVI…
## 5 ZISSIM… 21211 Hampden 7 NORTHERN "1023 36TH S…
## 6 ZORBAS 21224 Greektown 2 SOUTHEASTERN "4710 EASTER…
# A summary of the data frame
summary(bmore_rstrnts)
## name zipCode neighborhood councilDistrict
## Length:1327 Min. :-21226 Length:1327 Min. : 1.000
## Class :character 1st Qu.: 21202 Class :character 1st Qu.: 2.000
## Mode :character Median : 21218 Mode :character Median : 9.000
## Mean : 21185 Mean : 7.191
## 3rd Qu.: 21226 3rd Qu.:11.000
## Max. : 21287 Max. :14.000
## policeDistrict Location 1
## Length:1327 Length:1327
## Class :character Class :character
## Mode :character Mode :character
##
##
##
We can look at the quantiles of quantitative variables.
quantile(bmore_rstrnts$councilDistrict)
## 0% 25% 50% 75% 100%
## 1 2 9 11 14
Can create a table. The useNA
argument controls whether the table includes NA values. You can use: - ‘no’ - don’t include. - ‘ifany’ - only of the count is positive. - ‘always’ - even if there are zero counts.
The default is no.
table(bmore_rstrnts$zipCode, useNA = 'ifany')
##
## -21226 21201 21202 21205 21206 21207 21208 21209 21210 21211
## 1 136 201 27 30 4 1 8 23 41
## 21212 21213 21214 21215 21216 21217 21218 21220 21222 21223
## 28 31 17 54 10 32 69 1 7 56
## 21224 21225 21226 21227 21229 21230 21231 21234 21237 21239
## 199 19 18 4 13 156 127 7 1 3
## 21251 21287
## 2 1
Can use two variables:
table(bmore_rstrnts$councilDistrict, bmore_rstrnts$zipCode)
##
## -21226 21201 21202 21205 21206 21207 21208 21209 21210 21211 21212
## 1 0 0 37 0 0 0 0 0 0 0 0
## 2 0 0 0 3 27 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 27
## 5 0 0 0 0 0 3 0 6 0 0 0
## 6 0 0 0 0 0 0 0 1 19 0 0
## 7 0 0 0 0 0 0 0 1 0 27 0
## 8 0 0 0 0 0 1 0 0 0 0 0
## 9 0 1 0 0 0 0 0 0 0 0 0
## 10 1 0 1 0 0 0 0 0 0 0 0
## 11 0 115 139 0 0 0 1 0 0 0 1
## 12 0 20 24 4 0 0 0 0 0 0 0
## 13 0 0 0 20 3 0 0 0 0 0 0
## 14 0 0 0 0 0 0 0 0 4 14 0
##
## 21213 21214 21215 21216 21217 21218 21220 21222 21223 21224 21225
## 1 2 0 0 0 0 0 0 7 0 140 1
## 2 0 0 0 0 0 0 0 0 0 54 0
## 3 2 17 0 0 0 3 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 31 0 0 0 0 0 0 0 0
## 6 0 0 15 1 0 0 0 0 0 0 0
## 7 0 0 6 7 15 6 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0 2 0 0
## 9 0 0 0 2 8 0 0 0 53 0 0
## 10 0 0 0 0 0 0 1 0 0 0 18
## 11 0 0 0 0 9 0 0 0 1 0 0
## 12 13 0 0 0 0 26 0 0 0 0 0
## 13 13 0 1 0 0 0 0 0 0 5 0
## 14 1 0 1 0 0 34 0 0 0 0 0
##
## 21226 21227 21229 21230 21231 21234 21237 21239 21251 21287
## 1 0 0 0 1 124 0 0 0 0 0
## 2 0 0 0 0 0 0 1 0 0 0
## 3 0 1 0 0 0 7 0 0 2 0
## 4 0 0 0 0 0 0 0 3 0 0
## 5 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0 0 0 0
## 8 0 2 13 0 0 0 0 0 0 0
## 9 0 0 0 11 0 0 0 0 0 0
## 10 18 0 0 133 0 0 0 0 0 0
## 11 0 0 0 11 0 0 0 0 0 0
## 12 0 0 0 0 2 0 0 0 0 0
## 13 0 1 0 0 1 0 0 0 0 1
## 14 0 0 0 0 0 0 0 0 0 0
Check For Missing Values
# How many missing values
sum(is.na(bmore_rstrnts$councilDistrict))
## [1] 0
# Could also see if there are any missing values
any(is.na(bmore_rstrnts$councilDistrict))
## [1] FALSE
# Are all the zipcodes greater than 0
all(bmore_rstrnts$zipCode > 0)
## [1] FALSE
Row and Column Sums
Check which columns have NA values
colSums(is.na(bmore_rstrnts))
## name zipCode neighborhood councilDistrict
## 0 0 0 0
## policeDistrict Location 1
## 0 0
Values With Specific Characteristics
table(bmore_rstrnts$zipCode %in% c('21212'))
##
## FALSE TRUE
## 1299 28
head(bmore_rstrnts[bmore_rstrnts$zipCode %in% c('21212'), ])
## # A tibble: 6 x 6
## name zipCode neighborhood councilDistrict policeDistrict `Location 1`
## <chr> <dbl> <chr> <dbl> <chr> <chr>
## 1 BAY A… 21212 Downtown 11 CENTRAL "206 REDWOOD…
## 2 ATWAT… 21212 Chinquapin P… 4 NORTHERN "529 BELVEDE…
## 3 CAFE … 21212 Rosebank 4 NORTHERN "438 BELVEDE…
## 4 CERIE… 21212 Chinquapin P… 4 NORTHERN "529 BELVEDE…
## 5 DUNKI… 21212 Homeland 4 NORTHERN "5422 YORK R…
## 6 FIELD… 21212 Mid-Govans 4 NORTHERN "5723 YORK R…
Cross Tabs
UCBAdmissions %>% as.data.frame() %>% summary()
## Admit Gender Dept Freq
## Admitted:12 Male :12 A:4 Min. : 8.0
## Rejected:12 Female:12 B:4 1st Qu.: 80.0
## C:4 Median :170.0
## D:4 Mean :188.6
## E:4 3rd Qu.:302.5
## F:4 Max. :512.0
xtabs(Freq ~ Gender + Admit, data = UCBAdmissions)
## Admit
## Gender Admitted Rejected
## Male 1198 1493
## Female 557 1278
Flat Tables
With lots of variables the cross-tabulation becomes very hard to read. Can use ftable()
instead.
warpbreaks$replicate <- rep(1:9, len = 54)
xtabs(breaks ~ ., data = warpbreaks)
## , , replicate = 1
##
## tension
## wool L M H
## A 26 18 36
## B 27 42 20
##
## , , replicate = 2
##
## tension
## wool L M H
## A 30 21 21
## B 14 26 21
##
## , , replicate = 3
##
## tension
## wool L M H
## A 54 29 24
## B 29 19 24
##
## , , replicate = 4
##
## tension
## wool L M H
## A 25 17 18
## B 19 16 17
##
## , , replicate = 5
##
## tension
## wool L M H
## A 70 12 10
## B 29 39 13
##
## , , replicate = 6
##
## tension
## wool L M H
## A 52 18 43
## B 31 28 15
##
## , , replicate = 7
##
## tension
## wool L M H
## A 51 35 28
## B 41 21 15
##
## , , replicate = 8
##
## tension
## wool L M H
## A 26 30 15
## B 20 39 16
##
## , , replicate = 9
##
## tension
## wool L M H
## A 67 36 26
## B 44 29 28
head(ftable(warpbreaks))
##
## "replicate" "1" "2" "3" "4" "5" "6" "7" "8" "9"
## "breaks" "wool" "tension"
## "10" "A" "L" 0 0 0 0 0 0 0 0 0
## "M" 0 0 0 0 0 0 0 0 0
## "H" 0 0 0 0 1 0 0 0 0
## "B" "L" 0 0 0 0 0 0 0 0 0
## "M" 0 0 0 0 0 0 0 0 0
## "H" 0 0 0 0 0 0 0 0 0
Size in Memory
Use object.size()
.
big_vec <- 1:1000000000
object.size(big_vec)
## 4000000048 bytes
print(object.size(big_vec), units = 'Mb')
## 3814.7 Mb
Creating New Variables
Often the raw data doesn’t have a value you’re looking for. You need to transform the data to get the values you would like.
Common variables: - ‘Missingness’ indicators. - ‘Cutting Up’ quantitative variables. - Applying transforms.
Creating Categorical Variables
Use cut()
to cut up quantitative variables.
bmore_rstrnts$zipGroups <- cut(bmore_rstrnts$zipCode, breaks = quantile(bmore_rstrnts$zipCode))
table(bmore_rstrnts$zipGroups)
##
## (-2.123e+04,2.12e+04] (2.12e+04,2.122e+04] (2.122e+04,2.123e+04]
## 337 375 282
## (2.123e+04,2.129e+04]
## 332
Can use cut2()
from the Hmisc
library to make it easier:
library(Hmisc)
bmore_rstrnts$zipGroups <- cut2(bmore_rstrnts$zipCode, g = 4)
table(bmore_rstrnts$zipGroups)
##
## [-21226,21205) [ 21205,21220) [ 21220,21227) [ 21227,21287]
## 338 375 300 314
Reshaping Data
You can melt the data frame, pulling the measurement variables into a single column:
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
car_data <- mtcars %>%
as_tibble() %>%
rownames_to_column(var = 'carname')
car_melt <- car_data %>% melt(id = c('carname', 'gear', 'cyl'), measure.vars = c('mpg', 'hp'))
Casting
The dcast()
function allows you to recast the frame into a different shape. We see cylinders in terms of mpg
and hp
.
# Standard
car_melt %>% dcast(cyl ~ variable)
## Aggregation function missing: defaulting to length
## cyl mpg hp
## 1 4 11 11
## 2 6 7 7
## 3 8 14 14
# Can also pass a different function to summarise with
car_melt %>% dcast(cyl ~ variable,mean)
## cyl mpg hp
## 1 4 26.66364 82.63636
## 2 6 19.74286 122.28571
## 3 8 15.10000 209.21429
Averaging Variables
Use tapply()
:
tapply(InsectSprays$count, InsectSprays$spray, sum)
## A B C D E F
## 174 184 25 59 42 200
Can also use split()
:
InsectSplits <- split(InsectSprays$count, InsectSprays$spray)
InsectSplits
## $A
## [1] 10 7 20 14 14 12 10 23 17 20 14 13
##
## $B
## [1] 11 17 21 11 16 14 17 17 19 21 7 13
##
## $C
## [1] 0 1 7 2 3 1 2 1 3 0 1 4
##
## $D
## [1] 3 5 12 6 4 3 5 5 5 5 2 4
##
## $E
## [1] 3 5 3 5 3 6 1 1 3 2 6 4
##
## $F
## [1] 11 9 15 22 15 16 13 10 26 26 24 13
InsectSplits <- lapply(InsectSplits, sum)
unlist(InsectSplits)
## A B C D E F
## 174 184 25 59 42 200
Merging Data
The merge()
function can merge data frames together. By default it merges by all of the columns that have a common name.
a <- tibble(
aa = 1:5,
numbera_a = 6:10,
normal_a = rnorm(5)
)
b <- tibble(
aa = 1:5,
number_b = 6:10,
normal_b = rnorm(5)
)
merge(a, b)
## aa numbera_a normal_a number_b normal_b
## 1 1 6 0.3841539 6 -0.28549072
## 2 2 7 0.2567812 7 -0.87727129
## 3 3 8 -1.5718964 8 -0.06147564
## 4 4 9 1.6131016 9 -0.98670425
## 5 5 10 -0.3146985 10 0.34195511
merge(a, b, by = 'aa')
## aa numbera_a normal_a number_b normal_b
## 1 1 6 0.3841539 6 -0.28549072
## 2 2 7 0.2567812 7 -0.87727129
## 3 3 8 -1.5718964 8 -0.06147564
## 4 4 9 1.6131016 9 -0.98670425
## 5 5 10 -0.3146985 10 0.34195511
An alternative is to use *_join()
functons from dplyr
.
full_join(a,b)
## Joining, by = "aa"
## # A tibble: 5 x 5
## aa numbera_a normal_a number_b normal_b
## <int> <int> <dbl> <int> <dbl>
## 1 1 6 0.384 6 -0.285
## 2 2 7 0.257 7 -0.877
## 3 3 8 -1.57 8 -0.0615
## 4 4 9 1.61 9 -0.987
## 5 5 10 -0.315 10 0.342