Probabilistic Matching Workflow

  1. Preprocessing: developing link keys, extracting information from link keys, normalization of link keys
  2. Reduction of search space: Blocking
  3. Comparison: String metrics, year comparisons, numeric comparisons
  4. Classification: Fellegi-Sunter Model
  5. Final prediction: cut off scores, validation
library(yaml)
library(haven)
library(rvest)
library(RecordLinkage)
## Datos de legisladores
legs <- yaml.load_file("./dta/legislators-current.yaml")

extraer_datos <- function(x) {
    x <- data.frame("official"=x$name$official_full,      
                    "state"=unique(sapply(x$terms, function(i) i$state)),
                    "party"=unique(sapply(x$terms, function(i) i$party)))
    return(x)
}
legs <- do.call(rbind, lapply(legs, extraer_datos))

## Extraer datos de contribuciones a legisladores
mapl <- read_html("http://maplight.org/us-congress/legislator")

links <- mapl %>%
    html_nodes("table") %>%
    html_nodes("a") %>%
    html_attr("href") %>%
    .[-c(1:4)]

data <- mapl %>%
    html_nodes("table") %>%
    .[[1]] %>%
    html_table() 

## Ahora podriamos recuperar la tabla de donantes
donations <- cbind(data, links)
naive <- merge(legs, donations,
               by.x="official",
               by.y="Name",
               all.y=FALSE)
head(naive)
nrow(naive) ## Nos falta mas de 100
## Preparar los datos
## Asegurarse de que las blocking variables son iguales
donations$State <- gsub("^([A-Z]{2})\\-.*", "\\1", donations$State)
legs$state <- as.character(legs$state) ## Que pasaria si fuese un factor
legs$party <- substring(legs$party, 1, 1)
## Por ahora no necesitamos estos datos
donations$Chamber <- NULL
donations$links <- NULL
## Poner las variables en el mismo orden
legs <- legs[, c("official", "party", "state")]
names(donations) <- names(legs)
## No necesitamos ni mayusculas ni puntuacion
donations$official <- tolower(donations$official)
legs$official <- tolower(legs$official)
donations$official <- gsub("[[:punct:]]", "", donations$official)
legs$official <- gsub("[[:punct:]]", "", legs$official)
## Fusion de registros
rpairs <- compare.linkage(legs, donations,                          
                          blockfld=c(2, 3),
                          strcmp=TRUE,
                          strcmpfun=levenshteinSim)

Calculate M and U weights using the EM algorithm

## Crear pesos usando barrera
rdist <- emWeights(rpairs, .6)

Define 0.8 as cut off for string comparators, converting distance metrics into 0/1 binary.

Initial estimates of M and U are set by the algorithm using frequencies of specific values for each column.

## Revision
tail(getPairs(rdist, 12, 11), 24)
## Regla de decision
res <- emClassify(rdist, 11, 10)
## Reconstruir la base de datos y comprobar que todo es correcto
head(res$data1)
head(res$data2)
linked <- cbind(res$pairs[res$prediction == "L", ], res$prediction[res$prediction == "L"])
cbind(res$data1[linked$id1, 'official'], res$data2[linked$id2, 'official'])
check <- cbind(res$pairs[res$prediction == "P", ], res$prediction[res$prediction == "P"])
cbind(res$data1[check$id1, 'official'], res$data2[check$id2, 'official'])

Hacer segue a supervised learning

LS0tIAp0aXRsZTogIkZ1c2nDs24gZGUgcmVnaXN0cm9zIgpkYXRlOiAiYHIgZm9ybWF0KFN5cy50aW1lKCksICclQiAlZCwgJVknKWAiCi0tLQoKYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0UsIGNhY2hlPUZBTFNFfQprbml0cjo6b3B0c19jaHVuayRzZXQoZXZhbCA9IEZBTFNFKSAKa25pdHI6Om9wdHNfY2h1bmskc2V0KGZpZy5wYXRoID0gJy4vYXNzZXRzLycpCmBgYAoKClByb2JhYmlsaXN0aWMgTWF0Y2hpbmcgV29ya2Zsb3cKCjEuIFByZXByb2Nlc3Npbmc6IGRldmVsb3BpbmcgbGluayBrZXlzLCBleHRyYWN0aW5nIGluZm9ybWF0aW9uIGZyb20gbGluayBrZXlzLCBub3JtYWxpemF0aW9uIG9mIGxpbmsga2V5cwoyLiBSZWR1Y3Rpb24gb2Ygc2VhcmNoIHNwYWNlOiBCbG9ja2luZwozLiBDb21wYXJpc29uOiBTdHJpbmcgbWV0cmljcywgeWVhciBjb21wYXJpc29ucywgbnVtZXJpYyBjb21wYXJpc29ucwo0LiBDbGFzc2lmaWNhdGlvbjogRmVsbGVnaS1TdW50ZXIgTW9kZWwKNS4gRmluYWwgcHJlZGljdGlvbjogY3V0IG9mZiBzY29yZXMsIHZhbGlkYXRpb24KCgoKYGBge3J9CmxpYnJhcnkoeWFtbCkKbGlicmFyeShoYXZlbikKbGlicmFyeShydmVzdCkKbGlicmFyeShSZWNvcmRMaW5rYWdlKQpgYGAKCmBgYHtyfQojIyBEYXRvcyBkZSBsZWdpc2xhZG9yZXMKbGVncyA8LSB5YW1sLmxvYWRfZmlsZSgiLi9kdGEvbGVnaXNsYXRvcnMtY3VycmVudC55YW1sIikKCmV4dHJhZXJfZGF0b3MgPC0gZnVuY3Rpb24oeCkgewogICAgeCA8LSBkYXRhLmZyYW1lKCJvZmZpY2lhbCI9eCRuYW1lJG9mZmljaWFsX2Z1bGwsICAgICAgCiAgICAgICAgICAgICAgICAgICAgInN0YXRlIj11bmlxdWUoc2FwcGx5KHgkdGVybXMsIGZ1bmN0aW9uKGkpIGkkc3RhdGUpKSwKICAgICAgICAgICAgICAgICAgICAicGFydHkiPXVuaXF1ZShzYXBwbHkoeCR0ZXJtcywgZnVuY3Rpb24oaSkgaSRwYXJ0eSkpKQogICAgcmV0dXJuKHgpCn0KYGBgCgpgYGB7cn0KbGVncyA8LSBkby5jYWxsKHJiaW5kLCBsYXBwbHkobGVncywgZXh0cmFlcl9kYXRvcykpCgojIyBFeHRyYWVyIGRhdG9zIGRlIGNvbnRyaWJ1Y2lvbmVzIGEgbGVnaXNsYWRvcmVzCm1hcGwgPC0gcmVhZF9odG1sKCJodHRwOi8vbWFwbGlnaHQub3JnL3VzLWNvbmdyZXNzL2xlZ2lzbGF0b3IiKQoKbGlua3MgPC0gbWFwbCAlPiUKICAgIGh0bWxfbm9kZXMoInRhYmxlIikgJT4lCiAgICBodG1sX25vZGVzKCJhIikgJT4lCiAgICBodG1sX2F0dHIoImhyZWYiKSAlPiUKICAgIC5bLWMoMTo0KV0KCmRhdGEgPC0gbWFwbCAlPiUKICAgIGh0bWxfbm9kZXMoInRhYmxlIikgJT4lCiAgICAuW1sxXV0gJT4lCiAgICBodG1sX3RhYmxlKCkgCgojIyBBaG9yYSBwb2RyaWFtb3MgcmVjdXBlcmFyIGxhIHRhYmxhIGRlIGRvbmFudGVzCmRvbmF0aW9ucyA8LSBjYmluZChkYXRhLCBsaW5rcykKYGBgCgoKYGBge3J9Cm5haXZlIDwtIG1lcmdlKGxlZ3MsIGRvbmF0aW9ucywKICAgICAgICAgICAgICAgYnkueD0ib2ZmaWNpYWwiLAogICAgICAgICAgICAgICBieS55PSJOYW1lIiwKICAgICAgICAgICAgICAgYWxsLnk9RkFMU0UpCmhlYWQobmFpdmUpCm5yb3cobmFpdmUpICMjIE5vcyBmYWx0YSBtYXMgZGUgMTAwCmBgYAoKYGBgCiMjIFByZXBhcmFyIGxvcyBkYXRvcwojIyBBc2VndXJhcnNlIGRlIHF1ZSBsYXMgYmxvY2tpbmcgdmFyaWFibGVzIHNvbiBpZ3VhbGVzCmRvbmF0aW9ucyRTdGF0ZSA8LSBnc3ViKCJeKFtBLVpdezJ9KVxcLS4qIiwgIlxcMSIsIGRvbmF0aW9ucyRTdGF0ZSkKbGVncyRzdGF0ZSA8LSBhcy5jaGFyYWN0ZXIobGVncyRzdGF0ZSkgIyMgUXVlIHBhc2FyaWEgc2kgZnVlc2UgdW4gZmFjdG9yCmxlZ3MkcGFydHkgPC0gc3Vic3RyaW5nKGxlZ3MkcGFydHksIDEsIDEpCmBgYAoKYGBgCiMjIFBvciBhaG9yYSBubyBuZWNlc2l0YW1vcyBlc3RvcyBkYXRvcwpkb25hdGlvbnMkQ2hhbWJlciA8LSBOVUxMCmRvbmF0aW9ucyRsaW5rcyA8LSBOVUxMCmBgYAoKYGBgCiMjIFBvbmVyIGxhcyB2YXJpYWJsZXMgZW4gZWwgbWlzbW8gb3JkZW4KbGVncyA8LSBsZWdzWywgYygib2ZmaWNpYWwiLCAicGFydHkiLCAic3RhdGUiKV0KbmFtZXMoZG9uYXRpb25zKSA8LSBuYW1lcyhsZWdzKQpgYGAKCmBgYHtyfQojIyBObyBuZWNlc2l0YW1vcyBuaSBtYXl1c2N1bGFzIG5pIHB1bnR1YWNpb24KZG9uYXRpb25zJG9mZmljaWFsIDwtIHRvbG93ZXIoZG9uYXRpb25zJG9mZmljaWFsKQpsZWdzJG9mZmljaWFsIDwtIHRvbG93ZXIobGVncyRvZmZpY2lhbCkKYGBgCgpgYGB7cn0KZG9uYXRpb25zJG9mZmljaWFsIDwtIGdzdWIoIltbOnB1bmN0Ol1dIiwgIiIsIGRvbmF0aW9ucyRvZmZpY2lhbCkKbGVncyRvZmZpY2lhbCA8LSBnc3ViKCJbWzpwdW5jdDpdXSIsICIiLCBsZWdzJG9mZmljaWFsKQpgYGAKCmBgYHtyfQojIyBGdXNpb24gZGUgcmVnaXN0cm9zCnJwYWlycyA8LSBjb21wYXJlLmxpbmthZ2UobGVncywgZG9uYXRpb25zLCAgICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAgICAgYmxvY2tmbGQ9YygyLCAzKSwKICAgICAgICAgICAgICAgICAgICAgICAgICBzdHJjbXA9VFJVRSwKICAgICAgICAgICAgICAgICAgICAgICAgICBzdHJjbXBmdW49bGV2ZW5zaHRlaW5TaW0pCmBgYAoKQ2FsY3VsYXRlIE0gYW5kIFUgd2VpZ2h0cyB1c2luZyB0aGUgRU0gYWxnb3JpdGhtCgpgYGB7cn0KIyMgQ3JlYXIgcGVzb3MgdXNhbmRvIGJhcnJlcmEKcmRpc3QgPC0gZW1XZWlnaHRzKHJwYWlycywgLjYpCmBgYAoKRGVmaW5lIDAuOCBhcyBjdXQgb2ZmIGZvciBzdHJpbmcgY29tcGFyYXRvcnMsIGNvbnZlcnRpbmcgZGlzdGFuY2UgbWV0cmljcyBpbnRvCjAvMSBiaW5hcnkuCgpJbml0aWFsIGVzdGltYXRlcyBvZiBNIGFuZCBVIGFyZSBzZXQgYnkgdGhlIGFsZ29yaXRobSB1c2luZwpmcmVxdWVuY2llcyBvZiBzcGVjaWZpYyB2YWx1ZXMgZm9yIGVhY2ggY29sdW1uLgoKYGBge3J9CiMjIFJldmlzaW9uCnRhaWwoZ2V0UGFpcnMocmRpc3QsIDEyLCAxMSksIDI0KQpgYGAKCmBgYHtyfQojIyBSZWdsYSBkZSBkZWNpc2lvbgpyZXMgPC0gZW1DbGFzc2lmeShyZGlzdCwgMTEsIDEwKQpgYGAKCmBgYHtyfQojIyBSZWNvbnN0cnVpciBsYSBiYXNlIGRlIGRhdG9zIHkgY29tcHJvYmFyIHF1ZSB0b2RvIGVzIGNvcnJlY3RvCmhlYWQocmVzJGRhdGExKQpoZWFkKHJlcyRkYXRhMikKYGBgCgpgYGB7cn0KbGlua2VkIDwtIGNiaW5kKHJlcyRwYWlyc1tyZXMkcHJlZGljdGlvbiA9PSAiTCIsIF0sIHJlcyRwcmVkaWN0aW9uW3JlcyRwcmVkaWN0aW9uID09ICJMIl0pCmNiaW5kKHJlcyRkYXRhMVtsaW5rZWQkaWQxLCAnb2ZmaWNpYWwnXSwgcmVzJGRhdGEyW2xpbmtlZCRpZDIsICdvZmZpY2lhbCddKQpgYGAKCmBgYHtyfQpjaGVjayA8LSBjYmluZChyZXMkcGFpcnNbcmVzJHByZWRpY3Rpb24gPT0gIlAiLCBdLCByZXMkcHJlZGljdGlvbltyZXMkcHJlZGljdGlvbiA9PSAiUCJdKQpjYmluZChyZXMkZGF0YTFbY2hlY2skaWQxLCAnb2ZmaWNpYWwnXSwgcmVzJGRhdGEyW2NoZWNrJGlkMiwgJ29mZmljaWFsJ10pCmBgYAoKIyMgSGFjZXIgc2VndWUgYSBzdXBlcnZpc2VkIGxlYXJuaW5nCg==