func readTitle(td TitleData) string {
	var str string
	var err os.Error

	toFind := fmt.Sprintf("<title>%s</title>", td.Title)

	// Start looking for the title.
	bzr := bzreader.NewBzReader(conf["data_dir"], curdbname, td.Start)

	toFindb := []byte(toFind)

	// Scan forward, chunk by chunk, until we hit the exact <title> line.
	for {
		bstr, berr := bzr.ReadBytes()
		if berr != nil {
			return ""
		}
		if bytes.Index(bstr, toFindb) >= 0 {
			break
		}
	}

	// Now scan forward to the opening <text tag of the page body.
	toFind = "<text"

	for {
		str, err = bzr.ReadString()
		if err != nil {
			return ""
		}
		if strings.Contains(str, toFind) {
			break
		}
	}

	// We found <text> in string. Capture everything after it. It may
	// also contain </text>, in which case the whole body is on this
	// one line and we are done.
	matches := wholetextrx.FindStringSubmatch(str)
	if matches != nil {
		return matches[1]
	}

	// Otherwise, it just has <text>: accumulate lines until </text>.
	buffer := bytes.NewBufferString("")

	matches = starttextrx.FindStringSubmatch(str)
	if matches != nil {
		fmt.Fprint(buffer, matches[1])
	}

	toFind = "</text>"

	for {
		str, err = bzr.ReadString()
		if err != nil {
			return ""
		}
		if strings.Contains(str, toFind) {
			break
		}
		fmt.Fprint(buffer, str)
	}

	matches = endtextrx.FindStringSubmatch(str)
	if matches != nil {
		fmt.Fprint(buffer, matches[1])
	}

	return buffer.String()
}
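// The three regexps used above (wholetextrx, starttextrx, endtextrx)
// are package-level variables declared elsewhere in this file. A
// minimal sketch of plausible definitions, shown in a comment so as
// not to duplicate the real declarations; the exact patterns here are
// an assumption for illustration, and would need "regexp" imported:
//
//	var (
//		// Whole body on one line: <text ...>body</text>
//		wholetextrx = regexp.MustCompile(`<text[^>]*>(.*)</text>`)
//		// Everything after the opening <text ...> tag.
//		starttextrx = regexp.MustCompile(`<text[^>]*>(.*)`)
//		// Everything before the closing </text> tag.
//		endtextrx = regexp.MustCompile(`(.*)</text>`)
//	)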
//
// Generate the new title cache file.
//
func generateNewTitleFile() (string, string) {
	// Create pdata/bzwikipedia.dat.
	dat_file_new := fmt.Sprintf("%v.new", conf["dat_file"])
	dfout, derr := os.OpenFile(dat_file_new, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666)
	if derr != nil {
		fmt.Printf("Unable to create '%v': %v\n", dat_file_new, derr)
		return "", ""
	}
	defer dfout.Close()

	// Create pdata/titlecache.dat.
	title_file_new := fmt.Sprintf("%v.new", conf["title_file"])
	fout, err := os.OpenFile(title_file_new, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666)
	if err != nil {
		fmt.Printf("Unable to create '%v': %v\n", title_file_new, err)
		return "", ""
	}
	defer fout.Close()

	// Plop version and dbname into bzwikipedia.dat.
	fmt.Fprintf(dfout, "version:%d\n", current_cache_version)
	fmt.Fprintf(dfout, "dbname:%v\n", curdbname)

	// Now read through all the bzip files looking for <title> bits.
	bzr := bzreader.NewBzReader(conf["data_dir"], curdbname, 1)

	// We print a notice every 100 chunks, just 'cuz it's more user friendly
	// to show _something_ going on.
	nextprint := 0

	// For title cache version 3:
	//
	// We are using <TITLE_DELIM>titlename<RECORD_DELIM>record, and it is
	// sorted, case sensitively, for binary searching.
	//
	// We are optionally discarding redirects and other titles.
	// Discarding redirects adds a small amount of complexity since we have
	// <title>, then a few lines later <redirect may or may not exist. So
	// we don't add <title> to the array until either A: we see another
	// <title> without seeing <redirect, or B: we reach end of file.
	//
	// We use make() to create an array of approximately how many items
	// we'll need, so that go isn't constantly reallocating titleslice.
	// 20 million should do it. (As of now, there are over 11 million
	// articles, about half of which are redirects, in pages-articles.)
	var titleslice = make([]TitleData, 0, 20000000)

	for {
		curindex := bzr.Index
		if curindex >= nextprint {
			nextprint = nextprint + 100
			fmt.Println("Reading chunk", curindex)
		}
		bstr, err := bzr.ReadBytes()
		if err == os.EOF {
			break
		}
		if err != nil {
			fmt.Printf("Error while reading chunk %v: %v\n", bzr.Index, err)
			panic("Unrecoverable error.")
		}
		// This accounts for both "" and is a quick optimization.
		if len(bstr) < 10 {
			continue
		}
		idx := bytes.Index(bstr, []byte("<title>"))
		if idx >= 0 {
			eidx := bytes.Index(bstr, []byte("</title>"))
			if eidx < 0 {
				fmt.Printf("eidx is less than 0 for </title>?\n")
				fmt.Printf("Index %d:%d\n", curindex, bzr.Index)
				fmt.Printf("String is: '%s'\n", bstr)
				panic("Can't find </title> tag - broken bz2?")
			}
			// len("<title>") == 7, so the title text starts at idx+7.
			titleslice = append(titleslice, TitleData{
				Title: string(bstr[idx+7 : eidx]),
				Start: curindex,
			})
		}
	}

	// Sort titles case sensitively so lookups can binary search the cache.
	tdlist(titleslice).Sort()

	for _, i := range titleslice {
		fmt.Fprintf(fout, "%c%s%c%d", TITLE_DELIM, i.Title, RECORD_DELIM, i.Start)
	}
	fmt.Fprintf(dfout, "rcount:%v\n", len(titleslice))

	return title_file_new, dat_file_new
}
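// tdlist is used above but declared elsewhere in this file. A minimal
// sketch of what it plausibly looks like, shown in a comment so as not
// to duplicate the real declaration: a thin sort.Interface wrapper over
// []TitleData whose Sort() produces the case-sensitive title order the
// cache needs for binary searching. This is an assumption for
// illustration, and would need "sort" imported:
//
//	type tdlist []TitleData
//
//	func (t tdlist) Len() int           { return len(t) }
//	func (t tdlist) Less(i, j int) bool { return t[i].Title < t[j].Title }
//	func (t tdlist) Swap(i, j int)      { t[i], t[j] = t[j], t[i] }
//
//	func (t tdlist) Sort() { sort.Sort(t) }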