コード例 #1
0
ファイル: prune_aliases.go プロジェクト: srom/ensu
// main filters a gzip-compressed RDF stream read from stdin down to the
// English label and alias statements of a known set of entities.
//
// The entity set is loaded from ENTITIES.txt (one entity per line). Every
// statement whose subject is in that set, whose predicate is rdfs:label or
// common.topic.alias, and whose object is an English literal is appended,
// gzip-compressed, to LABELS_2.nt.gz. Each distinct literal text is also
// written once to stdout.
//
// Malformed input lines and unreadable files cause a panic.
func main() {
	// Reading labels.nt.gz stream from stdin.
	gzipReader, err := gzip.NewReader(os.Stdin)
	if err != nil {
		panic(err)
	}

	// Append RDF to LABELS_2.nt.gz, creating the file when it does not exist.
	// A single open with O_CREATE avoids falling back to a truncating
	// os.Create when the first open fails for an unrelated reason.
	rdfFile, err := os.OpenFile("LABELS_2.nt.gz", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0666)
	if err != nil {
		panic(err)
	}
	rdfWriter := gzip.NewWriter(rdfFile)

	// Write unique aliases to stdout.
	aliasWriter := bufio.NewWriter(os.Stdout)

	defer func() {
		// Cleanup. Errors are reported rather than dropped because a failed
		// flush or close means the output is incomplete.
		gzipReader.Close()
		if err := aliasWriter.Flush(); err != nil {
			fmt.Fprintln(os.Stderr, "flushing standard output:", err)
		}
		// Close flushes buffered data and writes the gzip footer.
		if err := rdfWriter.Close(); err != nil {
			fmt.Fprintln(os.Stderr, "closing gzip stream of LABELS_2.nt.gz:", err)
		}
		if err := rdfFile.Close(); err != nil {
			fmt.Fprintln(os.Stderr, "closing LABELS_2.nt.gz:", err)
		}
	}()

	// Keep track of seen aliases.
	seenAliases := make(map[string]struct{})

	reAlias := regexp.MustCompile(`^"(.+)"@en$`)
	rePredicate := regexp.MustCompile(
		`^(<http://www.w3.org/2000/01/rdf-schema#label>|` +
			`<http://rdf.basekb.com/ns/common.topic.alias>)$`)

	// Put entities into a set in memory.
	entities := make(map[string]struct{})
	f, err := os.Open("ENTITIES.txt")
	if err != nil {
		panic(err)
	}
	s := bufio.NewScanner(f)
	for s.Scan() {
		entities[s.Text()] = struct{}{}
	}
	// A read error would leave the set silently truncated, so fail loudly.
	if err := s.Err(); err != nil {
		panic(err)
	}
	f.Close()

	scanner := bufio.NewScanner(gzipReader)
	for scanner.Scan() {
		t := scanner.Text() // Read line.
		quad, err := nquads.Parse(t)
		if err != nil {
			panic(err)
		}

		if _, ok := entities[quad.Subject]; !ok {
			continue
		}
		if !rePredicate.MatchString(quad.Predicate) {
			continue
		}
		// One regex evaluation both tests the object and extracts the text.
		m := reAlias.FindStringSubmatch(quad.Object)
		if m == nil {
			continue
		}
		obj := m[1]

		fmt.Fprintln(rdfWriter, t)

		if _, seen := seenAliases[obj]; !seen {
			seenAliases[obj] = struct{}{}
			fmt.Fprintf(aliasWriter, "%s\n", obj)
		}
	}
	// Scan returns false on both EOF and failure; report the latter.
	if err := scanner.Err(); err != nil {
		fmt.Fprintln(os.Stderr, "reading standard input:", err)
	}
}
コード例 #2
0
ファイル: select_labels.go プロジェクト: srom/ensu
// main selects, from a gzip-compressed RDF stream on stdin, the statements
// whose object is an English literal listed in aliases.txt and whose subject
// matches the entity pattern.
//
// The comparison with aliases.txt (one alias per line) is case-insensitive.
// Matching statements are appended, gzip-compressed, to labels.nt.gz, and each
// distinct matching subject is written once to stdout. Both outputs are
// flushed every 50 matches.
//
// Malformed input lines and unreadable files cause a panic.
func main() {
	// Reading .nt.gz stream from stdin.
	gzipReader, err := gzip.NewReader(os.Stdin)
	if err != nil {
		panic(err)
	}

	// Append RDF to labels.nt.gz, creating the file when it does not exist.
	// A single open with O_CREATE avoids falling back to a truncating
	// os.Create when the first open fails for an unrelated reason.
	rdfFile, err := os.OpenFile("labels.nt.gz", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0666)
	if err != nil {
		panic(err)
	}
	rdfWriter := gzip.NewWriter(rdfFile)

	// Writing entities into stdout.
	entityWriter := bufio.NewWriter(os.Stdout)

	var writeCount int64

	defer func() {
		// Cleanup. Errors are reported rather than dropped because a failed
		// flush or close means the output is incomplete.
		gzipReader.Close()
		// Close flushes buffered data and writes the gzip footer.
		if err := rdfWriter.Close(); err != nil {
			fmt.Fprintln(os.Stderr, "closing gzip stream of labels.nt.gz:", err)
		}
		if err := rdfFile.Close(); err != nil {
			fmt.Fprintln(os.Stderr, "closing labels.nt.gz:", err)
		}
		if err := entityWriter.Flush(); err != nil {
			fmt.Fprintln(os.Stderr, "flushing standard output:", err)
		}
	}()

	reAlias := regexp.MustCompile(`^"(.+)"@en$`)
	reEntity := regexp.MustCompile(`^.+ns/m\..+>$`)

	// Put aliases into memory, lower-cased so lookups are case-insensitive.
	aliases := make(map[string]struct{})
	f, err := os.Open("aliases.txt")
	if err != nil {
		panic(err)
	}
	s := bufio.NewScanner(f)
	for s.Scan() {
		aliases[strings.ToLower(s.Text())] = struct{}{}
	}
	// A read error would leave the set silently truncated, so fail loudly.
	if err := s.Err(); err != nil {
		panic(err)
	}
	f.Close()

	// Keep track of seen entities.
	seenEntity := make(map[string]struct{})

	// Reading RDF lines (N-triples).
	scanner := bufio.NewScanner(gzipReader)
	for scanner.Scan() {
		t := scanner.Text() // Read line.
		quad, err := nquads.Parse(t)
		if err != nil {
			panic(err)
		}

		if !reEntity.MatchString(quad.Subject) {
			continue
		}
		// One regex evaluation both tests the object and extracts the text.
		m := reAlias.FindStringSubmatch(quad.Object)
		if m == nil {
			continue
		}
		if _, ok := aliases[strings.ToLower(m[1])]; !ok {
			continue
		}

		// It's a match! Save rdf row and entity.
		writeCount++
		if _, seen := seenEntity[quad.Subject]; !seen {
			seenEntity[quad.Subject] = struct{}{}
			fmt.Fprintf(entityWriter, "%s\n", quad.Subject)
		}
		fmt.Fprintln(rdfWriter, t)

		// Periodic flush so partial results are visible during long runs.
		if writeCount%50 == 0 {
			entityWriter.Flush()
			rdfWriter.Flush()
		}
	}
	if err := scanner.Err(); err != nil {
		fmt.Fprintln(os.Stderr, "reading standard input:", err)
	}
}
コード例 #3
0
ファイル: select_types.go プロジェクト: srom/ensu
// main selects, from a gzip-compressed RDF stream on stdin, the statements
// about known entities whose object belongs to one of a fixed list of topic
// namespaces.
//
// The entity set is loaded from entities.txt (one entity per line). A
// statement is kept when its subject matches the entity pattern, is present
// in that set, and its object contains one of the topic prefixes. Kept
// statements are appended, gzip-compressed, to types_final_ter.nt.gz, and each
// distinct object is written once to stdout. Both outputs are flushed every
// 50 kept statements; the number of lines read is reported on stderr every
// 10000 lines.
//
// Malformed input lines and unreadable files cause a panic.
func main() {
	// Reading .nt.gz stream from stdin.
	gzipReader, err := gzip.NewReader(os.Stdin)
	if err != nil {
		panic(err)
	}

	// Append RDF to types_final_ter.nt.gz, creating the file when it does not
	// exist. A single open with O_CREATE avoids falling back to a truncating
	// os.Create when the first open fails for an unrelated reason.
	rdfFile, err := os.OpenFile("types_final_ter.nt.gz", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0666)
	if err != nil {
		panic(err)
	}
	rdfWriter := gzip.NewWriter(rdfFile)

	// Write unique types to stdout.
	typeWriter := bufio.NewWriter(os.Stdout)

	defer func() {
		// Cleanup. Errors are reported rather than dropped because a failed
		// flush or close means the output is incomplete.
		gzipReader.Close()
		if err := typeWriter.Flush(); err != nil {
			fmt.Fprintln(os.Stderr, "flushing standard output:", err)
		}
		// Close flushes buffered data and writes the gzip footer.
		if err := rdfWriter.Close(); err != nil {
			fmt.Fprintln(os.Stderr, "closing gzip stream of types_final_ter.nt.gz:", err)
		}
		if err := rdfFile.Close(); err != nil {
			fmt.Fprintln(os.Stderr, "closing types_final_ter.nt.gz:", err)
		}
	}()

	// Topics to consider.
	topics := []string{
		"<http://rdf.basekb.com/ns/government.",
		"<http://rdf.basekb.com/ns/law.",
		"<http://rdf.basekb.com/ns/organization.",
		"<http://rdf.basekb.com/ns/education.",
		"<http://rdf.basekb.com/ns/business.",
		"<http://rdf.basekb.com/ns/people.",
		"<http://rdf.basekb.com/ns/religion.",
		"<http://rdf.basekb.com/ns/military.",
		"<http://rdf.basekb.com/ns/location.",
		"<http://rdf.basekb.com/ns/music.",
		"<http://rdf.basekb.com/ns/book.",
		"<http://rdf.basekb.com/ns/media_common.",
		"<http://rdf.basekb.com/ns/fictional_universe.",
		"<http://rdf.basekb.com/ns/sports.",
		"<http://rdf.basekb.com/ns/internet.",
		"<http://rdf.basekb.com/ns/film.",
		"<http://rdf.basekb.com/ns/tv.",
	}

	// Put entities into a set in memory.
	entities := make(map[string]struct{})
	f, err := os.Open("entities.txt")
	if err != nil {
		panic(err)
	}
	s := bufio.NewScanner(f)
	for s.Scan() {
		entities[s.Text()] = struct{}{}
	}
	// A read error would leave the set silently truncated, so fail loudly.
	if err := s.Err(); err != nil {
		panic(err)
	}
	f.Close()

	reEntity := regexp.MustCompile(`^.+ns/m\..+>$`)

	// Keep track of seen types.
	seenTypes := make(map[string]struct{})

	var writeCount int64
	var count int64

	// Reading RDF lines (N-triples).
	scanner := bufio.NewScanner(gzipReader)
	for scanner.Scan() {
		t := scanner.Text() // Read line.
		quad, err := nquads.Parse(t)
		if err != nil {
			panic(err)
		}

		// Progress report, throttled: one unbuffered stderr write per input
		// line dominates the run time on large dumps.
		count++
		if count%10000 == 0 {
			fmt.Fprintln(os.Stderr, count)
		}

		if !reEntity.MatchString(quad.Subject) {
			continue
		}
		if _, ok := entities[quad.Subject]; !ok {
			continue
		}

		// Entity found: keep the row only if its object is in a wanted topic.
		valid := false
		for _, topicPattern := range topics {
			if strings.Contains(quad.Object, topicPattern) {
				valid = true
				break
			}
		}
		if !valid {
			continue
		}

		writeCount++

		// Append rdf row.
		fmt.Fprintln(rdfWriter, t)

		// Append new types to stdout.
		if _, seen := seenTypes[quad.Object]; !seen {
			seenTypes[quad.Object] = struct{}{}
			fmt.Fprintln(typeWriter, quad.Object)
		}

		// Periodic flush so partial results are visible during long runs.
		if writeCount%50 == 0 {
			typeWriter.Flush()
			rdfWriter.Flush()
		}
	}
	// Scan returns false on both EOF and failure; report the latter.
	if err := scanner.Err(); err != nil {
		fmt.Fprintln(os.Stderr, "reading standard input:", err)
	}
}
コード例 #4
0
ファイル: prune_types.go プロジェクト: srom/ensu
// main filters a gzip-compressed RDF stream read from stdin down to the
// statements whose subject is in a known set of entities.
//
// The entity set is loaded from ENTITIES.txt (one entity per line). Every
// statement about one of those entities is appended, gzip-compressed, to
// TYPES_2.nt.gz, and each distinct object of such a statement is written once
// to stdout. The number of lines read is reported on stderr every 10000
// lines.
//
// Malformed input lines and unreadable files cause a panic.
func main() {
	// Reading types.nt.gz stream from stdin.
	gzipReader, err := gzip.NewReader(os.Stdin)
	if err != nil {
		panic(err)
	}

	// Append RDF to TYPES_2.nt.gz, creating the file when it does not exist.
	// A single open with O_CREATE avoids falling back to a truncating
	// os.Create when the first open fails for an unrelated reason.
	rdfFile, err := os.OpenFile("TYPES_2.nt.gz", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0666)
	if err != nil {
		panic(err)
	}
	rdfWriter := gzip.NewWriter(rdfFile)

	// Write unique types to stdout.
	typeWriter := bufio.NewWriter(os.Stdout)

	defer func() {
		// Cleanup. Errors are reported rather than dropped because a failed
		// flush or close means the output is incomplete.
		gzipReader.Close()
		if err := typeWriter.Flush(); err != nil {
			fmt.Fprintln(os.Stderr, "flushing standard output:", err)
		}
		// Close flushes buffered data and writes the gzip footer.
		if err := rdfWriter.Close(); err != nil {
			fmt.Fprintln(os.Stderr, "closing gzip stream of TYPES_2.nt.gz:", err)
		}
		if err := rdfFile.Close(); err != nil {
			fmt.Fprintln(os.Stderr, "closing TYPES_2.nt.gz:", err)
		}
	}()

	// Put entities into a set in memory.
	entities := make(map[string]struct{})
	f, err := os.Open("ENTITIES.txt")
	if err != nil {
		panic(err)
	}
	s := bufio.NewScanner(f)
	for s.Scan() {
		entities[s.Text()] = struct{}{}
	}
	// A read error would leave the set silently truncated, so fail loudly.
	if err := s.Err(); err != nil {
		panic(err)
	}
	f.Close()

	// Keep track of seen types.
	seenTypes := make(map[string]struct{})

	count := 0
	scanner := bufio.NewScanner(gzipReader)
	for scanner.Scan() {
		t := scanner.Text() // Read line.
		quad, err := nquads.Parse(t)
		if err != nil {
			panic(err)
		}

		// Progress report on stderr.
		count++
		if count%10000 == 0 {
			fmt.Fprintln(os.Stderr, count)
		}

		if _, ok := entities[quad.Subject]; !ok {
			continue
		}
		if _, seen := seenTypes[quad.Object]; !seen {
			// First time this type is seen: record and emit it.
			seenTypes[quad.Object] = struct{}{}
			fmt.Fprintf(typeWriter, "%s\n", quad.Object)
		}
		fmt.Fprintln(rdfWriter, t)
	}
	// Scan returns false on both EOF and failure; report the latter.
	if err := scanner.Err(); err != nil {
		fmt.Fprintln(os.Stderr, "reading standard input:", err)
	}
}
コード例 #5
0
ファイル: prune_entities.go プロジェクト: srom/ensu
// main counts, per subject, the statements of a gzip-compressed RDF stream on
// stdin whose object contains a fixed namespace pattern, then writes up to
// 2000 subjects to stdout in the order returned by sortMap.
//
// Subjects listed in the second column of map_id_freebase_id.csv (wrapped in
// angle brackets) form the politicians set. Their count stays at 1 no matter
// how many matching statements they have. When pol is true, each politician
// is additionally written as an "object,subject" row the first time it is
// seen.
//
// Malformed input lines, CSV errors and unreadable files cause a panic.
func main() {
	limit := 2000 // Maximum number of entities written at the end.
	pol := false  // Whether to emit politician rows while scanning.
	pattern := "<http://rdf.basekb.com/ns/tv."

	// Reading types.nt.gz stream from stdin.
	gzipReader, err := gzip.NewReader(os.Stdin)
	if err != nil {
		panic(err)
	}

	// Write unique entities to stdout.
	entityWriter := bufio.NewWriter(os.Stdout)

	defer func() {
		// Cleanup. A failed flush means the output is incomplete, so say so.
		gzipReader.Close()
		if err := entityWriter.Flush(); err != nil {
			fmt.Fprintln(os.Stderr, "flushing standard output:", err)
		}
	}()

	politicians := make(map[string]struct{})
	f, err := os.Open("map_id_freebase_id.csv")
	if err != nil {
		panic(err)
	}
	r := csv.NewReader(f)
	for {
		record, err := r.Read()
		if err == io.EOF {
			break
		} else if err != nil {
			panic(err)
		}
		// Guard the column access so a malformed file fails with a clear
		// message instead of an index-out-of-range panic.
		if len(record) < 2 {
			panic(fmt.Sprintf("map_id_freebase_id.csv: expected at least 2 columns, got %d", len(record)))
		}
		politicians["<"+record[1]+">"] = struct{}{}
	}
	f.Close()

	// Number of matching statements seen per entity.
	seenEntities := make(map[string]int)

	scanner := bufio.NewScanner(gzipReader)
	for scanner.Scan() {
		t := scanner.Text() // Read line.
		quad, err := nquads.Parse(t)
		if err != nil {
			panic(err)
		}

		if !strings.Contains(quad.Object, pattern) {
			continue
		}

		_, isPolitician := politicians[quad.Subject]
		v, seen := seenEntities[quad.Subject]
		if !seen {
			// Add entity to set.
			seenEntities[quad.Subject] = 1
			if isPolitician && pol {
				// Write entity.
				fmt.Fprintf(entityWriter, "%s,%s\n", quad.Object, quad.Subject)
			}
			continue
		}
		// Only non-politicians accumulate a count beyond the first sighting.
		if !isPolitician {
			seenEntities[quad.Subject] = v + 1
		}
	}
	// Scan returns false on both EOF and failure; report the latter.
	if err := scanner.Err(); err != nil {
		fmt.Fprintln(os.Stderr, "reading standard input:", err)
	}

	// NOTE(review): sortMap is defined outside this view; presumably it
	// returns the keys ordered by descending count — confirm.
	keys := sortMap(seenEntities)
	for idx, key := range keys {
		if idx >= limit {
			break
		}
		// Write the first entities returned by sortMap, up to the limit.
		fmt.Fprintf(entityWriter, "%s\n", key)
	}
}