func getPages(url, id string, DB database.MongoDB) { if parser.IsCached(url) { return } defer parser.DeferedCache(url) doc, err := goquery.NewDocument(url) if err != nil { parser.Log.Critical("Problems %s", url) return } _id := strings.Split(url, "nuDeputadoId=") doc.Find("#mesAno option").Each(func(_ int, s *goquery.Selection) { monthYear, ok := s.Attr("value") if ok { dateData := strings.Split(monthYear, "-") if dateData[1] == "2014" { fullQuotasUrl := QUOATAANALITICOURL fullQuotasUrl = strings.Replace(fullQuotasUrl, "MONTH", dateData[0], -1) fullQuotasUrl = strings.Replace(fullQuotasUrl, "YEAR", dateData[1], -1) fullQuotasUrl = strings.Replace(fullQuotasUrl, "ID", _id[1], -1) getQuotaPage(id, fullQuotasUrl, DB) } } }) }
func (p SaveDeputiesFromTransparenciaBrasil) Run(DB database.MongoDB) { source := models.Source{ Url: "http://dev.transparencia.org.br/", Note: "Transparencia Brasil", } if parser.IsCached("http://dev.transparencia.org.br/") { parser.Log.Info("SaveDeputiesFromTransparenciaBrasil Cached") return } defer parser.DeferedCache("http://dev.transparencia.org.br/") parser.Log.Info("Starting SaveDeputiesFromTransparenciaBrasil") c := transparencia.New("kqOfbdNKSlpf") query := map[string]string{ "casa": "1", } parliamenrians, err := c.Excelencias(query) parser.CheckError(err) for _, parliamenrian := range parliamenrians { uri := models.MakeUri(parliamenrian.Apelido) parser.Log.Info("Saving %s", parliamenrian.Nome) _, err := DB.Upsert(bson.M{"id": uri}, bson.M{ "$currentDate": bson.M{ "updatedat": true, }, "$set": bson.M{ "summary": parliamenrian.MiniBio, "nationalidentify": parliamenrian.CPF, }, "$addToSet": bson.M{ "sources": source, "identifiers": bson.M{ "$each": []bson.M{ { "identifier": parliamenrian.Id, "scheme": "TransparenciaBrasilID", }, { "identifier": parliamenrian.CPF, "scheme": "CPF", }, }, }, }, }, models.Parliamentarian{}) parser.CheckError(err) } }
func (p SaveDeputiesFromSearch) Run(DB database.MongoDB) { searchURL := "http://www2.camara.leg.br/deputados/pesquisa" if parser.IsCached(searchURL) { parser.Log.Info("SaveDeputiesFromSearch Cached") return } defer parser.DeferedCache(searchURL) var doc *goquery.Document var e error if doc, e = goquery.NewDocument(searchURL); e != nil { parser.Log.Critical(e.Error()) } source := models.Source{ Url: searchURL, Note: "Pesquisa Câmara", } doc.Find("#deputado option").Each(func(i int, s *goquery.Selection) { value, _ := s.Attr("value") if value != "" { info := regexp.MustCompile("=|%23|!|\\||\\?").Split(value, -1) name := parser.Titlelize(info[0]) q := bson.M{ "id": models.MakeUri(name), } _, err := DB.Upsert(q, bson.M{ "$setOnInsert": bson.M{ "createdat": time.Now(), }, "$currentDate": bson.M{ "updatedat": true, }, "$addToSet": bson.M{ "sources": source, "identifiers": models.Identifier{ Identifier: info[2], Scheme: "nMatricula", }, }, }, models.Parliamentarian{}) parser.CheckError(err) } }) }
func (p SaveDeputiesQuotas) Run(DB database.MongoDB) { url := "http://www.camara.gov.br/cota-parlamentar/pg-cota-lista-deputados.jsp" if parser.IsCached(url) { return } defer parser.DeferedCache(url) doc, err := goquery.NewDocument(url) if err != nil { panic(err) return } doc.Find("#content ul li a").Each(func(_ int, s *goquery.Selection) { url, _ := s.Attr("href") name_party := strings.Split(s.Text(), "-") name := strings.TrimSpace(name_party[0]) id := models.MakeUri(name) if !strings.Contains(id, "lideranca") { getPages(CAMARABASEURL+url, id, DB) } }) }
func saveDeputies(id string, d models.Parliamentarian, DB database.MongoDB) { bioURL := "http://www2.camara.leg.br/deputados/pesquisa/layouts_deputados_biografia?pk=" + id if parser.IsCached(bioURL) { parser.Log.Info("SaveDeputiesAbout(%s) Cached", id) return } source := models.Source{ Url: bioURL, Note: "Pesquisa Câmara", } var doc *goquery.Document var e error if doc, e = goquery.NewDocument(bioURL); e != nil { parser.Log.Critical(e.Error()) } bio := doc.Find("#bioDeputado .bioOutros") biographyItems := make([]string, 0) bio.Each(func(i int, s *goquery.Selection) { title := s.Find(".bioOutrosTitulo").Text() if title != "" { title = strings.TrimSpace(title) title = strings.Replace(title, ":", "", -1) body := s.Find(".bioOutrosTexto").Text() biographyItems = append(biographyItems, title) biographyItems = append(biographyItems, body) biographyItems = append(biographyItems, "") } }) bioDetails := doc.Find("#bioDeputado .bioDetalhes strong") birthdateA := strings.Split(bioDetails.Eq(1).Text(), "/") var year int switch id { case "123756", "160635": year = 1970 case "74230", "129618": year = 1952 case "74665", "141387": year = 1953 case "73933": year = 1959 case "73786": year = 1939 case "74124": year = 1964 case "74447": year = 1936 case "74474": year = 1940 default: parser.Log.Debug("(%s) %s", id, birthdateA) if len(birthdateA) != 3 { parser.Log.Debug("Error, deputies without year %s", bioURL) year = 0 } else { year, _ = strconv.Atoi(birthdateA[2]) } } birthDate := popolo.Date{} if len(birthdateA) > 1 { month, _ := strconv.Atoi(birthdateA[1]) day, _ := strconv.Atoi(birthdateA[0]) loc, _ := time.LoadLocation("America/Sao_Paulo") birthDate = popolo.Date{time.Date(year, time.Month(month), day, 0, 0, 0, 0, loc)} } _, err := DB.Upsert(bson.M{"id": d.Id}, bson.M{ "$setOnInsert": bson.M{ "createdat": time.Now(), }, "$currentDate": bson.M{ "updatedat": true, }, "$set": bson.M{ "summary": bio.Eq(0).Find(".bioOutrosTexto").Text(), "biography": strings.Join(biographyItems, "\n"), "link": "http://www.camara.gov.br/internet/Deputado/dep_Detalhe.asp?id=" + id, "birthdate": birthDate, }, "$addToSet": bson.M{ "sources": source, }, }, models.Parliamentarian{}) parser.CheckError(err) parser.CacheURL(bioURL) }
func (self SaveSenatorsFromIndex) Run(DB database.MongoDB) { indexURL := "http://www.senado.gov.br" if parser.IsCached(indexURL) { parser.Log.Info("SaveSenatorsFromIndex Cached") return } defer parser.DeferedCache(indexURL) source := models.Source{ Url: indexURL, Note: "senado.gov.br website", } var doc *goquery.Document var e error if doc, e = goquery.NewDocument(indexURL + "/senadores/"); e != nil { parser.Log.Critical(e.Error()) } doc.Find("#senadores tbody tr").Each(func(i int, s *goquery.Selection) { data := s.Find("td") name := data.Eq(0).Text() link, okLink := data.Eq(0).Find("a").Attr("href") if !okLink { parser.CheckError(errors.New("link not found")) } else { link = indexURL + link } email, okEmail := data.Eq(6).Find("a").Attr("href") if !okEmail { email = "" } else { email = strings.Replace(email, "mailto:", "", -1) } partyId := models.MakeUri(data.Eq(1).Text()) DB.Upsert(bson.M{"id": partyId}, bson.M{ "$setOnInsert": bson.M{ "createdat": time.Now(), }, "$currentDate": bson.M{ "updatedat": true, }, "$set": bson.M{ "id": partyId, "classification": "party", }, }, &models.Party{}) parliamenrianId := models.MakeUri(name) q := bson.M{ "id": parliamenrianId, } re := regexp.MustCompile("paginst/senador(.+)a.asp") senatorId := re.FindStringSubmatch(link)[1] _, err := DB.Upsert(q, bson.M{ "$setOnInsert": bson.M{ "createdat": time.Now(), }, "$currentDate": bson.M{ "updatedat": true, }, "$addToSet": bson.M{ "sources": source, "contactdetails": bson.M{ "$each": []models.ContactDetail{ { Label: "Telefone", Type: "phone", Value: data.Eq(4).Text(), Sources: []models.Source{source}, }, { Label: "Fax", Type: "fax", Value: data.Eq(5).Text(), Sources: []models.Source{source}, }, }, }, "identifiers": bson.M{ "$each": []models.Identifier{ {Identifier: senatorId, Scheme: "CodSenador"}, }, }, }, "$set": bson.M{ "name": name, "email": email, "link": link, "shortname": models.MakeUri(name), }, }, models.Parliamentarian{}) parser.CheckError(err) docDetails, e := goquery.NewDocument(link) if e != nil { parser.Log.Critical(e.Error()) } info := docDetails.Find(".dadosSenador b") birthdateA := strings.Split(info.Eq(1).Text(), "/") year, _ := strconv.Atoi(birthdateA[2]) month, _ := strconv.Atoi(birthdateA[1]) day, _ := strconv.Atoi(birthdateA[0]) loc, _ := time.LoadLocation("America/Sao_Paulo") birthDate := popolo.Date{time.Date(year, time.Month(month), day, 0, 0, 0, 0, loc)} _, err = DB.Upsert(q, bson.M{ "$setOnInsert": bson.M{ "createdat": time.Now(), }, "$currentDate": bson.M{ "updatedat": true, }, "$set": bson.M{ "birthdate": birthDate, }, "$addToSet": bson.M{ "sources": source, "othernames": models.OtherNames{ Name: info.Eq(0).Text(), Note: "Nome de nascimento", }, "contactdetails": models.ContactDetail{ Label: "Gabinete", Type: "address", Value: info.Eq(4).Text(), Sources: []models.Source{source}, }, }, }, models.Parliamentarian{}) parser.CreateMembermeship(DB, models.Rel{ Id: parliamenrianId, Link: parser.LinkTo("parliamenrians", parliamenrianId), }, models.Rel{ Id: partyId, Link: parser.LinkTo("parties", partyId), }, source, "Filiado", "Partido") parser.CheckError(err) }) }
func (_ SavePartiesFromTSE) Run(DB database.MongoDB) { url := "http://www.tse.jus.br/partidos/partidos-politicos/registrados-no-tse" if parser.IsCached(url) { parser.Log.Info("SavePartiesFromTSE Cached") return } defer parser.DeferedCache(url) source := models.Source{ Url: url, Note: "Tribunal Superior Eleitoral", } var doc *goquery.Document var e error if doc, e = goquery.NewDocument(url); e != nil { parser.Log.Critical(e.Error()) } const ( IDX = iota SIGLA_IDX NAME_IDX DEFERIMENTO_IDX PRESIDENT_IDX N_IDX ) doc.Find("#textoConteudo table tr").Each(func(i int, s *goquery.Selection) { if s.Find(".titulo_tabela").Length() < 6 && s.Find("td").Length() > 1 { info := s.Find("td") parser.Log.Info("%s - %s - %s - %s - %s - %s", info.Eq(IDX).Text(), info.Eq(SIGLA_IDX).Text(), info.Eq(NAME_IDX).Text(), info.Eq(DEFERIMENTO_IDX).Text(), info.Eq(PRESIDENT_IDX).Text(), info.Eq(N_IDX).Text(), ) partyId := models.MakeUri(info.Eq(SIGLA_IDX).Text()) DB.Upsert(bson.M{"id": partyId}, bson.M{ "$setOnInsert": bson.M{ "createdat": time.Now(), }, "$currentDate": bson.M{ "updatedat": true, }, "$set": bson.M{ "id": partyId, "name": parser.Titlelize(info.Eq(NAME_IDX).Text()), "othernames": []bson.M{{ "name": info.Eq(SIGLA_IDX).Text(), }}, "classification": "party", }, "$addToSet": bson.M{ "sources": []models.Source{source}, }, }, &models.Party{}) urlDetails, b := info.Eq(SIGLA_IDX).Find("a").Attr("href") if b { docDetails, err := goquery.NewDocument(urlDetails) if err != nil { parser.Log.Critical(err.Error()) } sourceDetails := models.Source{ Url: urlDetails, Note: "Tribunal Superior Eleitoral", } contactdetails := make([]bson.M, 0) details := docDetails.Find("#ancora-text-um p") address := strings.Split(details.Eq(3).Text(), ":")[1] contactdetails = append(contactdetails, bson.M{ "label": "Endereço", "type": "address", "value": address, "sources": []models.Source{sourceDetails}, }) contactdetails = append(contactdetails, bson.M{ "label": "CEP", "type": "zipcode", "value": findZipcode(0, details), "sources": []models.Source{sourceDetails}, }) phoneString := strings.Split(details.Eq(5).Text(), ":")[1] phone := strings.Split(phoneString, "/")[0] contactdetails = append(contactdetails, bson.M{ "label": "Telefone", "type": "phone", "value": phone, "sources": []models.Source{sourceDetails}, }) faxString := strings.Split(details.Eq(6).Text(), ":")[1] fax := strings.Split(faxString, "/")[0] contactdetails = append(contactdetails, bson.M{ "label": "Fax", "type": "fax", "value": fax, "sources": []models.Source{sourceDetails}, }) website, ok := details.Eq(7).Find("a").Attr("href") if ok { contactdetails = append(contactdetails, bson.M{ "label": "Site", "type": "website", "value": website, "sources": []models.Source{sourceDetails}, }) } details.Eq(8).Find("a").Each(func(i int, ss *goquery.Selection) { email, ok := ss.Attr("href") if !ok { return } contactdetails = append(contactdetails, bson.M{ "label": "Email", "type": "email", "value": email, "sources": []models.Source{sourceDetails}, }) }) data := bson.M{ "$setOnInsert": bson.M{ "createdat": time.Now(), }, "$currentDate": bson.M{ "updatedat": true, }, "$set": bson.M{ "contactdetails": contactdetails, }, } DB.Upsert(bson.M{"id": partyId}, data, models.Party{}) } } }) }
func getQuotaPage(id, url string, DB database.MongoDB) { if parser.IsCached(url) { return } defer parser.DeferedCache(url) <-time.After(2 * time.Second) doc, err := goquery.NewDocument(url) if err != nil { parser.Log.Error(err.Error(), url) return } var p models.Parliamentarian DB.FindOne(bson.M{ "id": id, }, &p) doc.Find(".espacoPadraoInferior2 tr:not(.celulasCentralizadas)").Each(func(i int, s *goquery.Selection) { data := s.Find("td") cnpj := data.Eq(0).Text() if cnpj == "TOTAL" { return } suplier := data.Eq(1).Text() orderN := strings.TrimSpace(data.Eq(2).Text()) companyUri := models.MakeUri(suplier) if cnpj == "" { cnpj = companyUri } _, err := DB.Upsert(bson.M{"id": cnpj}, bson.M{ "$set": bson.M{ "name": suplier, "uri": companyUri, }, }, models.Company{}) parser.CheckError(err) switch len(data.Nodes) { case 4: // value := data.Eq(3).Text() // log.Println("normal:", cnpj, "|", suplier, "|", orderN, value) // log.Println("skip") case 7: sendedAt, _ := time.Parse("2006-01-02", strings.Split(data.Eq(3).Text(), " ")[0]) value := strings.Replace(data.Eq(6).Text(), "R$", "", -1) value = strings.Replace(value, ".", "", -1) value = strings.Replace(value, "-", "", -1) value = strings.TrimSpace(strings.Replace(value, ",", ".", -1)) valueF, _ := strconv.ParseFloat(value, 64) parser.Log.Debug(orderN) orderNS := strings.Split(orderN, ":") var ticket string if len(orderNS) == 1 { ticket = strings.TrimSpace(orderNS[0]) } else { ticket = strings.TrimSpace(orderNS[1]) } _, err = DB.Upsert(bson.M{"order": orderN, "parliamentarian": p.Id}, bson.M{ "$set": bson.M{ "company": cnpj, "date": sendedAt, "passenger_name": data.Eq(4).Text(), "route": data.Eq(5).Text(), "value": valueF, "ticket": ticket, }, }, models.Quota{}) parser.CheckError(err) default: panic(data.Text()) } }) }
func (p SaveDeputiesFromXML) Run(DB database.MongoDB) { xmlURL := "http://www.camara.gov.br/SitCamaraWS/Deputados.asmx/ObterDeputados" if parser.IsCached(xmlURL) { parser.Log.Info("SaveDeputiesFromXML Cached") return } defer parser.DeferedCache(xmlURL) source := models.Source{ Url: xmlURL, Note: "Câmara API", } var doc *goquery.Document var e error if doc, e = goquery.NewDocument(xmlURL); e != nil { parser.Log.Critical(e.Error()) } doc.Find("deputado").Each(func(i int, s *goquery.Selection) { name := parser.Titlelize(s.Find("nomeparlamentar").First().Text()) parser.Log.Info("Saving " + name) partyId := models.MakeUri(s.Find("partido").First().Text()) DB.Upsert(bson.M{"id": partyId}, bson.M{ "$setOnInsert": bson.M{ "createdat": time.Now(), }, "$currentDate": bson.M{ "updatedat": true, }, "$set": bson.M{ "id": partyId, "classification": "party", }, }, &models.Party{}) parliamenrianId := models.MakeUri(name) q := bson.M{ "id": parliamenrianId, } fullName := strings.Split(parser.Titlelize(s.Find("nome").First().Text()), " ") _, err := DB.Upsert(q, bson.M{ "$setOnInsert": bson.M{ "createdat": time.Now(), }, "$currentDate": bson.M{ "updatedat": true, }, "$set": bson.M{ "name": &name, "sortname": &name, "id": models.MakeUri(name), "gender": s.Find("sexo").First().Text(), "image": s.Find("urlFoto").First().Text(), "email": s.Find("email").First().Text(), }, "$addToSet": bson.M{ "sources": source, "identifiers": bson.M{ "$each": []models.Identifier{ {Identifier: s.Find("idParlamentar").First().Text(), Scheme: "idParlamentar"}, {Identifier: s.Find("ideCadastro").First().Text(), Scheme: "ideCadastro"}, }, }, "othernames": models.OtherNames{ Name: parser.Titlelize(s.Find("nome").First().Text()), FamilyName: fullName[len(fullName)-1:][0], GivenName: fullName[0], Note: "Nome de nascimento", }, "contactdetails": bson.M{ "$each": []models.ContactDetail{ { Label: "Telefone", Type: "phone", Value: s.Find("fone").First().Text(), Sources: []models.Source{source}, }, { Label: "Gabinete", Type: "address", Value: s.Find("gabinete").First().Text() + ", Anexo " + s.Find("anexo").First().Text(), Sources: []models.Source{source}, }, }, }, }, }, &models.Parliamentarian{}) parser.CreateMembermeship(DB, models.Rel{ Id: parliamenrianId, Link: parser.LinkTo("parliamenrians", parliamenrianId), }, models.Rel{ Id: partyId, Link: parser.LinkTo("parties", partyId), }, source, "Filiado", "Partido") parser.CheckError(err) }) }