Esempio n. 1
0
// readTitle returns the text body of the page described by td, or "" if any
// read from the bzip reader fails before the body has been fully collected.
//
// The reader is opened at td.Start and consumed in three phases: skip to the
// chunk containing "<title>td.Title</title>", skip to the first string
// containing "<text", then accumulate strings until one contains "</text>".
// The package-level regexes wholetextrx, starttextrx and endtextrx pick the
// payload out of the opening and closing lines; in each case submatch 1 is
// what gets used.
func readTitle(td TitleData) string {
	bzr := bzreader.NewBzReader(conf["data_dir"], curdbname, td.Start)

	// Phase 1: byte-level scan for the exact title element.
	titleTag := []byte(fmt.Sprintf("<title>%s</title>", td.Title))
	for located := false; !located; {
		chunk, err := bzr.ReadBytes()
		if err != nil {
			return ""
		}
		located = bytes.Index(chunk, titleTag) >= 0
	}

	// seek pulls strings from bzr until one contains marker and returns that
	// string. Strings read before the match are appended to sink when sink is
	// non-nil. The boolean is false if the reader reported an error first.
	seek := func(marker string, sink *bytes.Buffer) (string, bool) {
		for {
			s, err := bzr.ReadString()
			if err != nil {
				return "", false
			}
			if strings.Contains(s, marker) {
				return s, true
			}
			if sink != nil {
				sink.WriteString(s)
			}
		}
	}

	// Phase 2: find the line that opens the text element.
	opening, ok := seek("<text", nil)
	if !ok {
		return ""
	}

	// The opening line may also hold the closing tag, in which case the
	// whole body is on this one line and we are done.
	if m := wholetextrx.FindStringSubmatch(opening); m != nil {
		return m[1]
	}

	// Otherwise keep whatever follows the opening tag on this line...
	var body bytes.Buffer
	if m := starttextrx.FindStringSubmatch(opening); m != nil {
		body.WriteString(m[1])
	}

	// Phase 3: ...collect every intermediate line verbatim...
	closing, ok := seek("</text>", &body)
	if !ok {
		return ""
	}

	// ...and finish with whatever precedes the closing tag on the last line.
	if m := endtextrx.FindStringSubmatch(closing); m != nil {
		body.WriteString(m[1])
	}

	return body.String()
}
Esempio n. 2
0
// generateNewTitleFile builds fresh copies of the title cache and of its
// companion dat file. Each is written to the configured path
// (conf["title_file"], conf["dat_file"]) with ".new" appended.
//
// Every chunk of the current database is scanned for a <title> element; each
// title is recorded with the value bzr.Index had just before the read that
// produced it. The list is then sorted and written to the title file, and
// the record count is appended to the dat file.
//
// On success it returns the new title file name followed by the new dat file
// name. If either file cannot be created or written, a message is printed
// and two empty strings are returned. A read error other than os.EOF, or a
// <title> with no closing </title> after it, panics.
func generateNewTitleFile() (string, string) {
	// Create the replacement dat file.
	datFileNew := fmt.Sprintf("%v.new", conf["dat_file"])
	dfout, derr := os.OpenFile(datFileNew, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666)
	if derr != nil {
		fmt.Printf("Unable to create '%v': %v\n", datFileNew, derr)
		return "", ""
	}
	defer dfout.Close()

	// Create the replacement title cache file.
	titleFileNew := fmt.Sprintf("%v.new", conf["title_file"])
	fout, terr := os.OpenFile(titleFileNew, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666)
	if terr != nil {
		// Report the error from this open, not the (nil) one from the dat file.
		fmt.Printf("Unable to create '%v': %v\n", titleFileNew, terr)
		return "", ""
	}
	defer fout.Close()

	// Plop version and dbname into the dat file.
	if _, werr := fmt.Fprintf(dfout, "version:%d\n", current_cache_version); werr != nil {
		fmt.Printf("Unable to write '%v': %v\n", datFileNew, werr)
		return "", ""
	}
	if _, werr := fmt.Fprintf(dfout, "dbname:%v\n", curdbname); werr != nil {
		fmt.Printf("Unable to write '%v': %v\n", datFileNew, werr)
		return "", ""
	}

	// Now read through all the bzip files looking for <title> bits.
	bzr := bzreader.NewBzReader(conf["data_dir"], curdbname, 1)

	// We print a notice every 100 chunks, just 'cuz it's more user friendly
	// to show _something_ going on.
	nextprint := 0

	// Title cache layout: each record is written as
	// <TITLE_DELIM>titlename<RECORD_DELIM>start, with no separator between
	// records. The list is sorted with tdlist.Sort before writing
	// (presumably so readers can binary search it - confirm against the
	// reader side).
	//
	// NOTE(review): every <title> found is recorded; this function does not
	// filter out redirects or any other titles.

	// Built once so the scan loop does not reallocate them per chunk.
	openTag := []byte("<title>")
	closeTag := []byte("</title>")

	// Reserve room for 20 million entries up front so that append is not
	// constantly reallocating titleslice. (At the time of writing there were
	// over 11 million articles, about half of them redirects, in
	// pages-articles.)
	titleslice := make([]TitleData, 0, 20000000)
	for {
		curindex := bzr.Index
		if curindex >= nextprint {
			nextprint = nextprint + 100
			fmt.Println("Reading chunk", curindex)
		}
		bstr, rerr := bzr.ReadBytes()
		if rerr == os.EOF {
			break
		}
		if rerr != nil {
			fmt.Printf("Error while reading chunk %v: %v\n", bzr.Index, rerr)
			panic("Unrecoverable error.")
		}

		// This accounts for both "" and is a quick optimization: anything
		// this short cannot hold a complete title element.
		if len(bstr) < 10 {
			continue
		}

		idx := bytes.Index(bstr, openTag)
		if idx < 0 {
			continue
		}

		// Look for the closing tag only after the opening one, so a stray
		// </title> earlier on the line cannot produce an inverted slice.
		rest := bstr[idx+len(openTag):]
		eidx := bytes.Index(rest, closeTag)
		if eidx < 0 {
			fmt.Printf("eidx is less than 0 for </title>?\n")
			fmt.Printf("Index %d:%d\n", curindex, bzr.Index)
			fmt.Printf("String is: '%s'\n", bstr)
			panic("Can't find </title> tag - broken bz2?")
		}
		titleslice = append(titleslice, TitleData{
			Title: string(rest[:eidx]),
			Start: curindex,
		})
	}

	tdlist(titleslice).Sort()

	for _, td := range titleslice {
		if _, werr := fmt.Fprintf(fout, "%c%s%c%d", TITLE_DELIM, td.Title, RECORD_DELIM, td.Start); werr != nil {
			fmt.Printf("Unable to write '%v': %v\n", titleFileNew, werr)
			return "", ""
		}
	}

	if _, werr := fmt.Fprintf(dfout, "rcount:%v\n", len(titleslice)); werr != nil {
		fmt.Printf("Unable to write '%v': %v\n", datFileNew, werr)
		return "", ""
	}

	return titleFileNew, datFileNew
}