459 min read

Scraping Wisconsin's Legislative Records

I wanted to explore the voting history of members of Wisconsin’s State House and Senate. Voting history is accessible for one member or one vote at a time on-line, but I couldn’t find any readily available summary data for download. So I thought I’d try scraping the data from the web.

I started by walking through a single example, the Senate’s first vote during the 2015-2016 biennial budget year. I used the rvest R package to read in the page and SelectorGadget to identify the parts of the page to scrape (see the vignette). Then I replaced all of the new line characters, \n, with commas, and eliminated leading, trailing, and multiple commas following this SO post. That left me with relatively clean text to work with.

library(plyr)
library(rvest)
library(tidyverse)

commafix <- function(x, nospace=TRUE) {
  y <- if(nospace) gsub(" ", "", x) else x
  gsub("^,*|(?<=,),|,*$", "", y, perl=TRUE)
}

sv1 <- read_html(
  "http://docs.legis.wisconsin.gov/2015/related/votes/senate/sv0001"
  ) %>% 
  html_nodes("td:nth-child(4), td:nth-child(3), td:nth-child(2), strong") %>%
  html_text() %>%
  gsub("\n", ",", .) %>% 
  commafix()

I then built a data frame from this information.

ayes <- unlist(strsplit(sv1[7:9], ","))
nays <- unlist(strsplit(sv1[11:13], ","))
nots <- unlist(strsplit(sv1[15:17], ","))

df <- data.frame(
  Type=sv1[1],
  Session=substring(sv1[2], 1, 9),
  Issue=sv1[4],
  Action=sv1[5],
  n.Ayes=gsub("[^0-9]", "", sv1[6]),
  n.Nays=gsub("[^0-9]", "", sv1[10]),
  n.Nots=gsub("[^0-9]", "", sv1[14]),
  Representative=c(ayes, nays, nots),
  Vote=rep(c("Aye", "Nay", NA), c(length(ayes), length(nays), length(nots))),
  stringsAsFactors=FALSE
)

This same format was used for all the votes, except that some records were jogged off by one column (an empty column 3). I scraped all 357 senate votes in the 2015-2016 biennium in one fell swoop and aligned the columns.

scrapeit <- function(url) {
  out <- read_html(url) %>% 
    html_nodes("td:nth-child(4), td:nth-child(3), td:nth-child(2), strong") %>%
    html_text() %>%
    gsub("\n", ",", .) %>% 
    gsub("^,*|(?<=,),|,*$", "", ., perl=TRUE) 
  return(out)
}

Senate2015 <- paste0("sv", sprintf("%04d", 1:357))

myfun<- function(.) { paste0("V", .)}

df <- lapply(Senate2015, function(sv) {
    scrapeit(paste0(
      "http://docs.legis.wisconsin.gov/2015/related/votes/senate/", sv)
    )
  }) %>%
  ldply(rbind) %>%
  mutate_all(as.character) %>%
  rename_all(funs(myfun)) %>%
  mutate(
    source = Senate2015,
    ayes.in.5 = substring(V5, 1, 4)=="AYES"
  )

df1 <- df[df$ayes.in.5, ] %>%
  select(-V17)
names(df1) <- paste0("x", 1:18)

df2 <- df[!df$ayes.in.5, ] %>%
  select(-V3)
names(df2) <- paste0("x", 1:18)

dfshort <- rbind(df1, df2) %>%
  rename(
    Source=x17,
    Type=x1,
    Session=x2,
    Issue=x3,
    Action=x4
  ) %>%
  mutate(
    n.Ayes = as.numeric(gsub("[^0-9]", "", x5)),
    n.Nays = as.numeric(gsub("[^0-9]", "", x9)),
    n.Nots = as.numeric(gsub("[^0-9]", "", x13)),
    Ayes = commafix(paste(x6, x7, x8, sep=",")),
    Nays = commafix(paste(x10, x11, x12, sep=",")),
    Nots = commafix(paste(x14, x15, x16, sep=","))
  ) %>%
  select(Source, Type, Session, Issue, Action, n.Ayes, Ayes, n.Nays, Nays, 
    n.Nots, Nots) %>%
  arrange(Source)

library(DT)
datatable(dfshort, options=list(pageLength=5))


More trickery was required to separate out the senator names that were collected together in a single column according to their votes.

dflist <- lapply(1:dim(dfshort)[1], function(i) 
  with(dfshort, {
    reps <- unlist(strsplit(paste(Ayes[i], Nays[i], Nots[i], sep=","), ","))
    reps <- reps[reps!=""]
    vote <- rep(c("Aye", "Nay", NA), c(n.Ayes[i], n.Nays[i], n.Nots[i]))
    data.frame(Source=Source[i], Representative=reps, Vote=vote, stringsAsFactors=FALSE)
  })
  )
dflong <- bind_rows(dflist) %>%
  left_join(dfshort) %>%
  select(Source, Type, Session, Issue, Action, Representative, Vote)

Finally, I scraped another webpage on the same site to get information on each of the senators like their party affiliation and the district they represent. Once again, fields were not aligned, so some manipulations were required.

sv2 <- read_html(
  "http://docs.legis.wisconsin.gov/2015/legislators/senate/") %>%
  html_nodes(".rounded") %>%
  html_text() %>%
  gsub("\n", ",", .) %>% 
  gsub("\r", "", .) %>% 
  commafix(., nospace=FALSE)

clip <- trimws(t(sapply(lapply(sv2, strsplit, ","), function(x) x[[1]][4:9])))
info <- as.data.frame(rbind(clip[grepl("istrict", clip[, 4]), 1:4],
  clip[grepl("istrict", clip[, 5]) & clip[, 1]>"a", c(1:3, 5)],
  clip[grepl("istrict", clip[, 5]) & clip[, 1]<"a", 2:5],
  clip[grepl("istrict", clip[, 6]), c(2:4, 6)]), stringsAsFactors=FALSE) %>%
  filter(
    V1!="Vacant"
  ) %>%
  mutate(
    Last = V1,
    Firstplus = gsub("\\)", "", gsub("-", ":", gsub("\\(", ":", V2))),
    District = as.numeric(gsub("[^0-9]", "", V4)),
    Role = if_else(V3=="", "Senator", V3)
  ) %>%
  separate(col=Firstplus, into=c("First", "Party", "City"), sep=":", 
    fill="left") %>%
  mutate(
    First=trimws(First),
    Party=trimws(Party),
    City=trimws(City)
  ) %>%
  select(Last, First, Party, City, Role, District)

Do I have information for all the names with votes? I used the R package stringdist to do approximate string matching of the names in the voting records and the informational data. Because the string matching function amatch() is case sensitive, I converted all of the names to lower case before matching them.

library(stringdist)

# unique names from voting records
unv <- unique(casefold(dflong$Representative))

# names in voting records that are missing from info
unv[is.na(amatch(unv, casefold(info$Last), maxDist=1))]
## [1] "farrow" "gudex"

I had to add two rows to the information data (from Wikipedia) for the missing senators. Then I combined the senator information with the voting record data and saved the data to a csv file.

info <- rbind(info,
  data.frame(Last="Farrow", First="Paul", Party="R", City="Pewaukee", 
    Role="Senator", District=33),
  data.frame(Last="Gudex", First="Rick", Party="R", City="Fond du Lac", 
    Role="Senator", District=18)) %>%
  arrange(Last)

name.match <- amatch(casefold(dflong$Representative), casefold(info$Last), 
  maxDist=1)
dfcomplete <- cbind(dflong, info[name.match, ]) %>%
  select(-Representative)

write.csv(dfcomplete, "WISenate2015.csv")

datatable(dfcomplete, options=list(pageLength=5))


I wanted to explore the voting history of members of Wisconsin’s State House and Senate, but I ended up spending a lot of time just pulling the data together. So much time, in fact, that I decided to continue with the exploration part in a later post. The Open Legislative Data Report Card, grades state legislatures on how they make their data publicly available. Wisconsin got a D. Only four states got an F: Alabama, Kentucky, Massachusetts, and Nebraska.