Reading from MySQL
Data structured in databases, tables, and fields.
Different tables are linked together using varaibles. A table in analogous to a data frame.
Test MySQL Instance
We set up a test MySQL instance using docker.
# Stop and remove any previous container running
docker stop greg_test_mysql
docker rm greg_test_mysql
# Spin up a new MySQL container.
docker run -p 3306:3306 --name greg_test_mysql -e MYSQL_ROOT_PASSWORD=d3d09j32d32 -d mysql:5.7.27
docker container ls
sleep 20
## greg_test_mysql
## greg_test_mysql
## 9b3e94d277fbbe6ed5a6985a0f0cfed41912a8defeb8948b080d5cf483193280
## CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
## 9b3e94d277fb mysql:5.7.27 "docker-entrypoint.s…" 1 second ago Up Less than a second 0.0.0.0:3306->3306/tcp, 33060/tcp greg_test_mysql
## 6833e3a9d28a candidate_lab "bash" 14 hours ago Up 14 hours focused_nobel
We retrieve and then import a test database from https://github.com/datacharmer/test_db.
cd ~/Documents/Study/Coursera_Data_Science_JHU/C3_Getting_and_Cleaning_Data
if [ ! -d test_db ]; then
git clone https://github.com/datacharmer/test_db.git
fi
cd test_db
mysql -h 127.0.0.1 -u root -pd3d09j32d32 < employees.sql
## mysql: [Warning] Using a password on the command line interface can be insecure.
## INFO
## CREATING DATABASE STRUCTURE
## INFO
## storage engine: InnoDB
## INFO
## LOADING departments
## INFO
## LOADING employees
## INFO
## LOADING dept_emp
## INFO
## LOADING dept_manager
## INFO
## LOADING titles
## INFO
## LOADING salaries
## data_load_time_diff
## 00:00:36
Connecting and Listing Databases
library(RMySQL)
## Loading required package: DBI
test_db <- dbConnect(
MySQL(),
user = 'root',
host = '127.0.0.1',
password = 'd3d09j32d32'
)
dbGetQuery(test_db, 'show databases;')
## Database
## 1 information_schema
## 2 employees
## 3 mysql
## 4 performance_schema
## 5 sys
dbDisconnect(test_db)
## [1] TRUE
Listing Tables
test_db <- dbConnect(
MySQL(),
user = 'root',
host = '127.0.0.1',
password = 'd3d09j32d32',
db = 'employees'
)
dbListTables(test_db)
## [1] "current_dept_emp" "departments" "dept_emp"
## [4] "dept_emp_latest_date" "dept_manager" "employees"
## [7] "salaries" "titles"
dbListFields(test_db, 'employees')
## [1] "emp_no" "birth_date" "first_name" "last_name" "gender"
## [6] "hire_date"
Running a Query
dbGetQuery(test_db, 'select count(*) from employees')
## count(*)
## 1 300024
dbGetQuery(test_db, 'select * from employees limit 10;')
## emp_no birth_date first_name last_name gender hire_date
## 1 10001 1953-09-02 Georgi Facello M 1986-06-26
## 2 10002 1964-06-02 Bezalel Simmel F 1985-11-21
## 3 10003 1959-12-03 Parto Bamford M 1986-08-28
## 4 10004 1954-05-01 Chirstian Koblick M 1986-12-01
## 5 10005 1955-01-21 Kyoichi Maliniak M 1989-09-12
## 6 10006 1953-04-20 Anneke Preusig F 1989-06-02
## 7 10007 1957-05-23 Tzvetan Zielinski F 1989-02-10
## 8 10008 1958-02-19 Saniya Kalloufi M 1994-09-15
## 9 10009 1952-04-19 Sumant Peac F 1985-02-18
## 10 10010 1963-06-01 Duangkaew Piveteau F 1989-08-24
Reading in a Table
employees <- dbReadTable(test_db, 'employees') %>% as_tibble()
employees
## # A tibble: 300,024 x 6
## emp_no birth_date first_name last_name gender hire_date
## <int> <chr> <chr> <chr> <chr> <chr>
## 1 10001 1953-09-02 Georgi Facello M 1986-06-26
## 2 10002 1964-06-02 Bezalel Simmel F 1985-11-21
## 3 10003 1959-12-03 Parto Bamford M 1986-08-28
## 4 10004 1954-05-01 Chirstian Koblick M 1986-12-01
## 5 10005 1955-01-21 Kyoichi Maliniak M 1989-09-12
## 6 10006 1953-04-20 Anneke Preusig F 1989-06-02
## 7 10007 1957-05-23 Tzvetan Zielinski F 1989-02-10
## 8 10008 1958-02-19 Saniya Kalloufi M 1994-09-15
## 9 10009 1952-04-19 Sumant Peac F 1985-02-18
## 10 10010 1963-06-01 Duangkaew Piveteau F 1989-08-24
## # … with 300,014 more rows
Select a Subset
female_query <- dbSendQuery(test_db, 'select first_name,last_name from employees where `gender`=\'F\' limit 10')
# Can also use the 'n = ' argument to fetch() to limit the query
females <- fetch(female_query)
dbClearResult(female_query)
## [1] TRUE
females %>% as_tibble()
## # A tibble: 10 x 2
## first_name last_name
## <chr> <chr>
## 1 Bezalel Simmel
## 2 Anneke Preusig
## 3 Tzvetan Zielinski
## 4 Sumant Peac
## 5 Duangkaew Piveteau
## 6 Mary Sluis
## 7 Cristinel Bouloucos
## 8 Kazuhide Peha
## 9 Bojan Montemayor
## 10 Suzette Pettey
dbDisconnect(test_db)
## [1] TRUE
Reading HDF5
- Hierarchical Data Format
- Used for storing large data sets
- Range of data types
- groups containing zero or more data sets and metadata
- Have a group header with group name and a list of attributes
- Have a group symbol table with a list of objects in group
- datasets are multidimensional arrays of elements with metadata
- Have a header with name, datatype, dataspace, and storage layout.
- Havea data array with the data.
library(rhdf5)
filename <- 'data_dir/example.h5'
file.remove(filename)
## [1] TRUE
h5f <- h5createFile(filename)
for (i in c('foo', 'bar', 'foo/blarg')) {
h5f <- h5createGroup(filename, i)
}
h5ls(filename)
## group name otype dclass dim
## 0 / bar H5I_GROUP
## 1 / foo H5I_GROUP
## 2 /foo blarg H5I_GROUP
Writing to Groups
data_a <- tibble(x = rnorm(200), y = 1:200, z = sample(rep(letters[1:20], each = 10)))
head(data_a)
## # A tibble: 6 x 3
## x y z
## <dbl> <int> <chr>
## 1 0.137 1 n
## 2 -0.495 2 h
## 3 0.699 3 t
## 4 -2.44 4 p
## 5 1.57 5 s
## 6 1.41 6 k
data_a %>% class()
## [1] "tbl_df" "tbl" "data.frame"
h5write(data_a, filename, 'foo/test_table')
data_b <- list(a = rnorm(10000), b = letters)
h5write(data_b, filename, 'bar/test_list')
data_c <- c(1:20)
h5write(data_c, filename, 'test_vector')
h5ls(filename)
## group name otype dclass dim
## 0 / bar H5I_GROUP
## 1 /bar test_list H5I_GROUP
## 2 /bar/test_list a H5I_DATASET FLOAT 10000
## 3 /bar/test_list b H5I_DATASET STRING 26
## 4 / foo H5I_GROUP
## 5 /foo blarg H5I_GROUP
## 6 /foo test_table H5I_DATASET COMPOUND 200
## 7 / test_vector H5I_DATASET INTEGER 20
Reading Data
info <- h5read(filename, '/foo/test_table')
head(info)
## x y z
## 1 0.1365751 1 n
## 2 -0.4948272 2 h
## 3 0.6985390 3 t
## 4 -2.4400917 4 p
## 5 1.5667556 5 s
## 6 1.4144548 6 k
info %>% class()
## [1] "data.frame"
Writing and Reading in Chunks
Can write to specific parts of the dataset
h5write(
c(10,10,10,10),
filename,
'test_vector',
index = list(1:4))
h5read(filename, 'test_vector')
## [1] 10 10 10 10 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
Webscraping
Programatically extracting data fro the HTML code of websites.
Using Readlines
uri <- url('http://datascience.study.foletta.org')
site <- readLines(uri)
close(uri)
head(site)
## [1] "<!DOCTYPE html>"
## [2] ""
## [3] "<html xmlns=\"http://www.w3.org/1999/xhtml\">"
## [4] ""
## [5] "<head>"
## [6] ""
Using XML
library(XML)
site <- htmlTreeParse('http://datascience.study.foletta.org', useInternalNodes = T)
xpathSApply(site, '//title')
## [[1]]
## <title>Coursera - Data Science - John Hopkins</title>
Using HTTR
library(httr)
http_response <- GET('http://datascience.study.foletta.org')
content <- content(http_response, as = 'text')
parsedHTML <- htmlParse(content, asText = T)
xpathSApply(parsedHTML, '//title')
## [[1]]
## <title>Coursera - Data Science - John Hopkins</title>
Accessing Sites with Passwords
http_response <- GET('http://httpbin.org/basic-auth/user/password')
http_response
## Response [http://httpbin.org/basic-auth/user/password]
## Date: 2019-10-24 19:51
## Status: 401
## Content-Type: <unknown>
## <EMPTY BODY>
http_response <- GET(
'http://httpbin.org/basic-auth/greg/somepass',
authenticate('greg', 'somepass')
)
http_response
## Response [http://httpbin.org/basic-auth/greg/somepass]
## Date: 2019-10-24 19:51
## Status: 200
## Content-Type: application/json
## Size: 47 B
## {
## "authenticated": true,
## "user": "greg"
## }
Using Handles
ds_hndle <- handle("http://datascience.study.foletta.org")
http_response <- GET(handle = ds_hndle, path = 'C3_Getting_and_Cleaning_Data/w2_notes.html')
http_response
## Response [http://datascience.study.foletta.org/C3_Getting_and_Cleaning_Data/w2_notes.html]
## Date: 2019-10-24 19:51
## Status: 200
## Content-Type: text/html; charset=utf-8
## Size: 95.4 kB
## <!DOCTYPE html>
##
## <html xmlns="http://www.w3.org/1999/xhtml">
##
## <head>
##
## <meta charset="utf-8" />
## <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
## <meta name="generator" content="pandoc" />
## <meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
## ...
Reading from APIs
Can authenticatte with OAuth using the oauth_app()
and sign_oauth1.0()
functions.
Reading from Other Sources
Files
To interact directly with files:
file()
url()
gzfile()
bzfile()
Use ?connections for more information.
Other Languages
The foreign
package is useful for interacting with other languages or statistical programming languages.
read.arff()
- Wekeread.dta()
- Stataread.mtp()
- Minitab
Other Databases
RPostgresSQL
RODBC
provides an interface to multiple databases.RMongo
for Mongo access.
Images
jpeg
readbitmap
png
EBImage
- from BioConductor
GIS Data
rdgal
rgeos
raster
Music and Sound
tuneR
seewave