func main() { // Reading labels.nt.gz stream from stdin. gzipReader, err := gzip.NewReader(os.Stdin) if err != nil { panic(err) } // Writing RDF into labels_final.nt.gz rdfFile, err := os.OpenFile("LABELS_2.nt.gz", os.O_APPEND|os.O_WRONLY, 0666) if err != nil { rdfFile, err = os.Create("LABELS_2.nt.gz") if err != nil { panic(err) } } rdfWriter := gzip.NewWriter(rdfFile) // Write unique aliases to stdout. aliasWriter := bufio.NewWriter(os.Stdout) defer func() { // Cleanup. gzipReader.Close() aliasWriter.Flush() rdfWriter.Flush() rdfWriter.Close() rdfFile.Close() }() // Keep track of seen aliases. seenAliases := make(map[string]struct{}) reAlias := regexp.MustCompile(`^"(.+)"@en$`) rePredicate := regexp.MustCompile( `^(<http://www.w3.org/2000/01/rdf-schema#label>|` + `<http://rdf.basekb.com/ns/common.topic.alias>)$`) // put entities into a set in memory entities := make(map[string]struct{}) f, err := os.Open("ENTITIES.txt") if err != nil { panic(err) } s := bufio.NewScanner(f) for s.Scan() { entity := s.Text() // Read entity. // Put entity in the set. var placeholder struct{} entities[entity] = placeholder } f.Close() scanner := bufio.NewScanner(gzipReader) for scanner.Scan() { t := scanner.Text() // Read line. quad, err := nquads.Parse(t) if err != nil { panic(err) } if _, ok := entities[quad.Subject]; ok { // Entity found. if reAlias.MatchString(quad.Object) && rePredicate.MatchString(quad.Predicate) { obj := reAlias.FindStringSubmatch(quad.Object)[1] fmt.Fprintln(rdfWriter, t) if _, seen := seenAliases[obj]; !seen { var placeholder struct{} seenAliases[obj] = placeholder fmt.Fprintf(aliasWriter, "%s\n", obj) } } } } }
func main() { // Reading .nt.gz stream from stdin. gzipReader, err := gzip.NewReader(os.Stdin) if err != nil { panic(err) } // Writing RDF into labels.nt.gz rdfFile, err := os.OpenFile("labels.nt.gz", os.O_APPEND|os.O_WRONLY, 0666) if err != nil { rdfFile, err = os.Create("labels.nt.gz") if err != nil { panic(err) } } rdfWriter := gzip.NewWriter(rdfFile) // Writing entities into stdout. entityWriter := bufio.NewWriter(os.Stdout) var writeCount int64 defer func() { // Cleanup gzipReader.Close() rdfWriter.Flush() rdfWriter.Close() rdfFile.Close() entityWriter.Flush() }() reAlias := regexp.MustCompile(`^"(.+)"@en$`) reEntity := regexp.MustCompile(`^.+ns/m\..+>$`) // put aliases into memory aliases := make(map[string]struct{}) f, err := os.Open("aliases.txt") if err != nil { panic(err) } s := bufio.NewScanner(f) for s.Scan() { alias := strings.ToLower(s.Text()) // Read alias. // Set in go: // programmers.stackexchange.com/questions/177428/sets-data-structure-in-golang var placeholder struct{} aliases[alias] = placeholder } f.Close() // Keep track of seen entities. seenEntity := make(map[string]struct{}) // Reading RDF lines (N-triples). scanner := bufio.NewScanner(gzipReader) for scanner.Scan() { t := scanner.Text() // Read line. quad, err := nquads.Parse(t) if err != nil { panic(err) } if reAlias.MatchString(quad.Object) && reEntity.MatchString(quad.Subject) { // Object, in english, refering to an entity. obj := reAlias.FindStringSubmatch(quad.Object)[1] if _, ok := aliases[strings.ToLower(obj)]; ok { // It's a match! Save alias, rdf row and entity. writeCount++ if _, seen := seenEntity[quad.Subject]; !seen { // Add entity to set. var placeholder struct{} seenEntity[quad.Subject] = placeholder // Write entity. fmt.Fprintf(entityWriter, "%s\n", quad.Subject) } fmt.Fprintln(rdfWriter, t) if writeCount%50 == 0 { entityWriter.Flush() rdfWriter.Flush() } } } } if err := scanner.Err(); err != nil { fmt.Fprintln(os.Stderr, "reading standard input:", err) } }
func main() { // Reading .nt.gz stream from stdin. gzipReader, err := gzip.NewReader(os.Stdin) if err != nil { panic(err) } // Writing RDF into types.nt.gz rdfFile, err := os.OpenFile("types_final_ter.nt.gz", os.O_APPEND|os.O_WRONLY, 0666) if err != nil { rdfFile, err = os.Create("types_final_ter.nt.gz") if err != nil { panic(err) } } rdfWriter := gzip.NewWriter(rdfFile) // Write unique types to stdout. typeWriter := bufio.NewWriter(os.Stdout) defer func() { // Cleanup. gzipReader.Close() typeWriter.Flush() rdfWriter.Flush() rdfWriter.Close() rdfFile.Close() }() // topics to consider. topics := []string{ "<http://rdf.basekb.com/ns/government.", "<http://rdf.basekb.com/ns/law.", "<http://rdf.basekb.com/ns/organization.", "<http://rdf.basekb.com/ns/education.", "<http://rdf.basekb.com/ns/business.", "<http://rdf.basekb.com/ns/people.", "<http://rdf.basekb.com/ns/religion.", "<http://rdf.basekb.com/ns/military.", "<http://rdf.basekb.com/ns/location.", "<http://rdf.basekb.com/ns/music.", "<http://rdf.basekb.com/ns/book.", "<http://rdf.basekb.com/ns/media_common.", "<http://rdf.basekb.com/ns/fictional_universe.", "<http://rdf.basekb.com/ns/sports.", "<http://rdf.basekb.com/ns/internet.", "<http://rdf.basekb.com/ns/film.", "<http://rdf.basekb.com/ns/tv.", } // put entities into a set in memory entities := make(map[string]struct{}) f, err := os.Open("entities.txt") if err != nil { panic(err) } s := bufio.NewScanner(f) for s.Scan() { entity := s.Text() // Read entity. // Put entity in the set. var placeholder struct{} entities[entity] = placeholder } f.Close() reEntity := regexp.MustCompile(`^.+ns/m\..+>$`) // Keep track of seen types. seenTypes := make(map[string]struct{}) var writeCount int64 var count int64 // Reading RDF lines (N-triples). scanner := bufio.NewScanner(gzipReader) for scanner.Scan() { t := scanner.Text() // Read line. quad, err := nquads.Parse(t) if err != nil { panic(err) } count++ fmt.Fprintln(os.Stderr, count) if reEntity.MatchString(quad.Subject) { if _, ok := entities[quad.Subject]; ok { // Entity found! valid := false for _, topicPattern := range topics { if strings.Contains(quad.Object, topicPattern) { valid = true break } } if !valid { continue } writeCount++ // Append rdf row. fmt.Fprintln(rdfWriter, t) // Append new types to stdout. if _, seen := seenTypes[quad.Object]; !seen { var placeholder struct{} seenTypes[quad.Object] = placeholder fmt.Fprintln(typeWriter, quad.Object) } if writeCount%50 == 0 { typeWriter.Flush() rdfWriter.Flush() } } } } }
func main() { // Reading types.nt.gz stream from stdin. gzipReader, err := gzip.NewReader(os.Stdin) if err != nil { panic(err) } // Writing RDF into labels_final.nt.gz rdfFile, err := os.OpenFile("TYPES_2.nt.gz", os.O_APPEND|os.O_WRONLY, 0666) if err != nil { rdfFile, err = os.Create("TYPES_2.nt.gz") if err != nil { panic(err) } } rdfWriter := gzip.NewWriter(rdfFile) // Write unique entities to stdout. typeWriter := bufio.NewWriter(os.Stdout) defer func() { // Cleanup. gzipReader.Close() typeWriter.Flush() rdfWriter.Flush() rdfWriter.Close() rdfFile.Close() }() // put entities into a set in memory entities := make(map[string]struct{}) f, err := os.Open("ENTITIES.txt") if err != nil { panic(err) } s := bufio.NewScanner(f) for s.Scan() { entity := s.Text() // Read entity. // Put entity in the set. var placeholder struct{} entities[entity] = placeholder } f.Close() // Keep track of seen entities. seenTypes := make(map[string]struct{}) count := 0 scanner := bufio.NewScanner(gzipReader) for scanner.Scan() { t := scanner.Text() // Read line. quad, err := nquads.Parse(t) if err != nil { panic(err) } count++ if count%10000 == 0 { fmt.Fprintln(os.Stderr, count) } if _, ok := entities[quad.Subject]; ok { if _, seen := seenTypes[quad.Object]; !seen { // Add type to set. var placeholder struct{} seenTypes[quad.Object] = placeholder fmt.Fprintf(typeWriter, "%s\n", quad.Object) } fmt.Fprintln(rdfWriter, t) } } }
func main() { max := 2000 pol := false pattern := "<http://rdf.basekb.com/ns/tv." // Reading types.nt.gz stream from stdin. gzipReader, err := gzip.NewReader(os.Stdin) if err != nil { panic(err) } // Write unique entities to stdout. entityWriter := bufio.NewWriter(os.Stdout) defer func() { // Cleanup. gzipReader.Close() entityWriter.Flush() }() politicians := make(map[string]struct{}) f, err := os.Open("map_id_freebase_id.csv") if err != nil { panic(err) } r := csv.NewReader(f) for { record, err := r.Read() if err == io.EOF { break } else if err != nil { panic(err) } var placeholder struct{} politicians["<"+record[1]+">"] = placeholder } f.Close() // Keep track of seen entities. seenEntities := make(map[string]int) scanner := bufio.NewScanner(gzipReader) for scanner.Scan() { t := scanner.Text() // Read line. quad, err := nquads.Parse(t) if err != nil { panic(err) } if strings.Contains(quad.Object, pattern) { if v, seen := seenEntities[quad.Subject]; !seen { // Add entity to set. seenEntities[quad.Subject] = 1 if _, ok := politicians[quad.Subject]; ok && pol { // Write entity. fmt.Fprintf(entityWriter, "%s,%s\n", quad.Object, quad.Subject) } //fmt.Fprintf(entityWriter, "%s\n", quad.Subject) } else { if _, ok := politicians[quad.Subject]; !ok { // Increment. seenEntities[quad.Subject] = v + 1 } // // Increment. // seenEntities[quad.Subject] = v + 1 } } } keys := sortMap(seenEntities) for idx, key := range keys { if idx+1 > max { break } // Write most common entities. fmt.Fprintf(entityWriter, "%s\n", key) } }