2. ページのテーブルを取得
# URL 設定
url <- "https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/vaccine_sesshujisseki.html"
# ページを取得する
html <- read_html(url)
html
## {html_document}
## <html lang="ja" class="is-nojs">
## [1] <head prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# article: htt ...
## [2] <body class="t-mhlw nav03">\r\n<!--\r\n<?ra g="!MB" line="*"?>\r\n-->\r\n ...
# <table> タグがあるノッドを取得
nodes <- html %>%
html_nodes("table")
nodes
## {xml_nodeset (1)}
## [1] <table border="1" cellpadding="1" cellspacing="1" style="width: 100%;"><t ...
# テーブルの中身を取得
table <- nodes %>%
html_table(fill = TRUE)
table
## [[1]]
## # A tibble: 32 x 5
## X1 X2 X3 X4 X5
## <chr> <chr> <chr> <chr> <chr>
## 1 日付 接種回数 内1回目 内2回目 施設数(*)
## 2 2021/02/17 125 125 0 8
## 3 2021/02/18 486 486 0 16
## 4 2021/02/19 4,428 4,428 0 68
## 5 2021/02/22 6,895 6,895 0 95
## 6 2021/02/24 5,954 5,954 0 96
## 7 2021/02/25 4,008 4,008 0 100
## 8 2021/02/26 6,634 6,634 0 100
## 9 2021/03/01 3,255 3,255 0 100
## 10 2021/03/02 2,987 2,987 0 100
## # … with 22 more rows
# テーブルが一つしかないが、リストとして返却されたため、1番目のテーブルを取得
table <- data.table(table[[1]])
# ヘッダーを取得
header <- table[1, ] %>%
as.list() %>%
unlist()
header
## X1 X2 X3 X4 X5
## "日付" "接種回数" "内1回目" "内2回目" "施設数(*)"
# 英語の方が誰でもわかるので、ここは英語のヘッダーを使うことになった
header <- c("date", "total", "first", "second", "facility")
table <- table[2:(nrow(table) - 1)]
colnames(table) <- header
table
## date total first second facility
## 1: 2021/02/17 125 125 0 8
## 2: 2021/02/18 486 486 0 16
## 3: 2021/02/19 4,428 4,428 0 68
## 4: 2021/02/22 6,895 6,895 0 95
## 5: 2021/02/24 5,954 5,954 0 96
## 6: 2021/02/25 4,008 4,008 0 100
## 7: 2021/02/26 6,634 6,634 0 100
## 8: 2021/03/01 3,255 3,255 0 100
## 9: 2021/03/02 2,987 2,987 0 100
## 10: 2021/03/03 2,531 2,531 0 101
## 11: 2021/03/04 1,871 1,871 0 108
## 12: 2021/03/05 7,295 7,295 0 149
## 13: 2021/03/08 24,327 24,327 0 288
## 14: 2021/03/09 36,762 36,762 0 445
## 15: 2021/03/10 41,392 41,357 35 563
## 16: 2021/03/11 32,234 31,826 408 642
## 17: 2021/03/12 49,358 46,453 2,905 767
## 18: 2021/03/15 59,733 55,204 4,529 1,043
## 19: 2021/03/16 68,916 67,446 1,470 1,304
## 20: 2021/03/17 78,294 73,352 4,942 1,505
## 21: 2021/03/18 71,217 67,217 4,000 1,633
## 22: 2021/03/19 70,133 63,041 7,092 1,746
## 23: 2021/03/22 73,863 70,115 3,748 1,869
## 24: 2021/03/23 46,428 43,801 2,627 1,963
## 25: 2021/03/24 42,054 38,731 3,323 2,043
## 26: 2021/03/25 33,942 31,571 2,371 2,092
## 27: 2021/03/26 47,747 43,993 3,754 2,145
## 28: 2021/03/29 67,793 44,039 23,754 2,250
## 29: 2021/03/30 59,069 27,242 31,827 2,356
## 30: 2021/03/31 53,008 24,213 28,795 2,467
## date total first second facility
# 最後は日付と数値の処理
table[, date := gsub(pattern = "/", replacement = "", date)]
table <- table[ , lapply(.SD, function(x){gsub(pattern = ",", replacement = "", x)})]
table
## date total first second facility
## 1: 20210217 125 125 0 8
## 2: 20210218 486 486 0 16
## 3: 20210219 4428 4428 0 68
## 4: 20210222 6895 6895 0 95
## 5: 20210224 5954 5954 0 96
## 6: 20210225 4008 4008 0 100
## 7: 20210226 6634 6634 0 100
## 8: 20210301 3255 3255 0 100
## 9: 20210302 2987 2987 0 100
## 10: 20210303 2531 2531 0 101
## 11: 20210304 1871 1871 0 108
## 12: 20210305 7295 7295 0 149
## 13: 20210308 24327 24327 0 288
## 14: 20210309 36762 36762 0 445
## 15: 20210310 41392 41357 35 563
## 16: 20210311 32234 31826 408 642
## 17: 20210312 49358 46453 2905 767
## 18: 20210315 59733 55204 4529 1043
## 19: 20210316 68916 67446 1470 1304
## 20: 20210317 78294 73352 4942 1505
## 21: 20210318 71217 67217 4000 1633
## 22: 20210319 70133 63041 7092 1746
## 23: 20210322 73863 70115 3748 1869
## 24: 20210323 46428 43801 2627 1963
## 25: 20210324 42054 38731 3323 2043
## 26: 20210325 33942 31571 2371 2092
## 27: 20210326 47747 43993 3754 2145
## 28: 20210329 67793 44039 23754 2250
## 29: 20210330 59069 27242 31827 2356
## 30: 20210331 53008 24213 28795 2467
## date total first second facility