func startSimpleLoad(db *imdb.DB, table string, columns ...string) *simpleLoad { logf("Reading list to populate table %s...", table) tx, err := db.Begin() csql.Panic(err) csql.Truncate(tx, db.Driver, table) ins, err := csql.NewInserter(tx, db.Driver, table, columns...) csql.Panic(err) atoms, err := newAtomizer(db, nil) // read only csql.Panic(err) return &simpleLoad{db, tx, table, 0, ins, atoms} }
// another produces a transaction from tx. It may or may not return the same // transaction depending on the driver being used. func (tx *tx) another() *tx { if tx.db.Driver == "sqlite3" { return tx } txx, err := tx.db.Begin() csql.Panic(err) return wrapTx(tx.db, txx) }
func (sl *simpleLoad) add(line []byte, args ...interface{}) { if err := sl.ins.Exec(args...); err != nil { toStr := func(v interface{}) string { return sf("%#v", v) } logf("Full %s info (that failed to add): %s", sl.table, fun.Map(toStr, args).([]string)) logf("Context: %s", line) csql.Panic(ef("Error adding to %s table: %s", sl.table, err)) } sl.count++ }
// databaseSize returns a pretty string indicating the size of the entire // database on disk. func databaseSize(db *imdb.DB, dsn string) string { if db.Driver == "sqlite3" { fi, err := os.Stat(dsn) csql.Panic(err) return prettyFileSize(fi.Size()) } var size string q := sf("SELECT pg_size_pretty(pg_database_size(current_database()))") csql.Scan(db.QueryRow(q), &size) return size }
// newAtomizer returns an atomizer that can be used to access or create new // atom identifiers. Note that if tx is nil, then the atomizer returned is // read-only (attempting to write will cause a panic). // // A read-only atomizer may be accessed from multiple goroutines // simultaneously, but a read/write atomizer may NOT. // // If a read/write atomizer is created, then the caller is responsible for // closing the transaction (which should be done immediately after a call to // atomizer.Close). // // Note that this function loads the entire set of atoms from the database // into memory, so it is costly. func newAtomizer(db *imdb.DB, tx *sql.Tx) (az *atomizer, err error) { defer csql.Safe(&err) az = &atomizer{db, make(atomMap, 1000000), 0, nil} if tx != nil { var err error az.ins, err = csql.NewInserter( tx, db.Driver, "atom", "id", "hash") csql.Panic(err) } rs := csql.Query(db, "SELECT id, hash FROM atom ORDER BY id ASC") csql.ForRow(rs, az.readRow) az.nextId++ return }
// listLinesSuspended is just like listLines, except it provides a way to // disable filtering lines with '{{SUSPENDED}}' in them. This is useful when // it's necessary to record suspended lines as resetting state associated with // an existing entity. func listLinesSuspended(list io.ReadCloser, suspended bool, do func([]byte)) { seenListName := false nameSuffix := []byte(" LIST") nameSuffix2 := []byte(" TRIVIA") nameSuffix3 := []byte(" RATINGS REPORT") dataStart, dataEnd := []byte("====="), []byte("----------") dataSection := false scanner := bufio.NewScanner(list) for scanner.Scan() { line := scanner.Bytes() if !seenListName { if bytes.HasSuffix(line, nameSuffix) || bytes.HasSuffix(line, nameSuffix2) { seenListName = true } else if bytes.HasSuffix(line, nameSuffix3) { seenListName = true dataSection = true } continue } if !dataSection { if bytes.HasPrefix(line, dataStart) { dataSection = true } continue } if dataSection && bytes.HasPrefix(line, dataEnd) { continue } if !suspended && bytes.Contains(line, attrSuspended) { continue } do(line) } csql.Panic(scanner.Err()) if err := list.Close(); err != nil { logf("Error closing list: %s", err) } }
func (sl *simpleLoad) done() { csql.Panic(sl.ins.Exec()) // inserts anything left in the buffer csql.Panic(sl.tx.Commit()) logf("Done with table %s. Inserted %d rows.", sl.table, sl.count) }
// listMovies reads the IMDb movies list and rebuilds the movie, tvshow and
// episode tables (plus any new atom/name rows). Errors are converted to a
// returned error via csql.Safe's panic recovery.
func listMovies(db *imdb.DB, movies io.ReadCloser) (err error) {
	defer csql.Safe(&err)

	logf("Reading movies list...")
	addedMovies, addedTvshows, addedEpisodes := 0, 0, 0

	// PostgreSQL wants different transactions for each inserter.
	// SQLite can't handle them. The wrapper type here ensures that
	// PostgreSQL gets multiple transactions while SQLite only gets one.
	tx, err := db.Begin()
	csql.Panic(err)
	txmovie := wrapTx(db, tx)
	txtv := txmovie.another()
	txepisode := txmovie.another()
	txname := txmovie.another()
	txatom := txmovie.another()

	// Drop data from the movie, tvshow and episode tables. They will be
	// rebuilt below.
	// The key here is to leave the atom and name tables alone. Invariably,
	// they will contain stale data. But the only side effect, I think, is
	// taking up space.
	// (Stale data can be removed with 'goim clean'.)
	csql.Truncate(txmovie, db.Driver, "movie")
	csql.Truncate(txtv, db.Driver, "tvshow")
	csql.Truncate(txepisode, db.Driver, "episode")

	// One buffered inserter per destination table, each bound to its own
	// (possibly shared, under SQLite) transaction.
	mvIns, err := csql.NewInserter(txmovie.Tx, db.Driver, "movie",
		"atom_id", "year", "sequence", "tv", "video")
	csql.Panic(err)
	tvIns, err := csql.NewInserter(txtv.Tx, db.Driver, "tvshow",
		"atom_id", "year", "sequence", "year_start", "year_end")
	csql.Panic(err)
	epIns, err := csql.NewInserter(txepisode.Tx, db.Driver, "episode",
		"atom_id", "tvshow_atom_id", "year", "season", "episode_num")
	csql.Panic(err)
	nameIns, err := csql.NewInserter(txname.Tx, db.Driver, "name",
		"atom_id", "name")
	csql.Panic(err)
	atoms, err := newAtomizer(db, txatom.Tx)
	csql.Panic(err)

	// Flush every inserter, close the atomizer and only then commit each
	// transaction. The order matters: buffered rows must reach their
	// transactions before Commit.
	defer func() {
		csql.Panic(mvIns.Exec())
		csql.Panic(tvIns.Exec())
		csql.Panic(epIns.Exec())
		csql.Panic(nameIns.Exec())

		csql.Panic(atoms.Close())
		csql.Panic(txmovie.Commit())
		csql.Panic(txtv.Commit())
		csql.Panic(txepisode.Commit())
		csql.Panic(txname.Commit())
		csql.Panic(txatom.Commit())
		logf("Done. Added %d movies, %d tv shows and %d episodes.",
			addedMovies, addedTvshows, addedEpisodes)
	}()

	listLines(movies, func(line []byte) {
		line = bytes.TrimSpace(line)
		fields := splitListLine(line)
		if len(fields) <= 1 {
			return
		}
		item, value := fields[0], fields[1]
		// Dispatch on the kind of entity named by this line.
		switch ent := mediaType(item); ent {
		case imdb.EntityMovie:
			m := imdb.Movie{}
			if !parseMovie(item, &m) {
				return
			}
			if existed, err := parseId(atoms, item, &m.Id); err != nil {
				csql.Panic(err)
			} else if !existed {
				// We only add a name when we add an atom.
				if err = nameIns.Exec(m.Id, m.Title); err != nil {
					logf("Full movie info (that failed to add): %#v", m)
					csql.Panic(ef("Could not add name '%s': %s", m, err))
				}
			}
			err := mvIns.Exec(m.Id, m.Year, m.Sequence, m.Tv, m.Video)
			if err != nil {
				logf("Full movie info (that failed to add): %#v", m)
				csql.Panic(ef("Could not add movie '%s': %s", m, err))
			}
			addedMovies++
		case imdb.EntityTvshow:
			tv := imdb.Tvshow{}
			if !parseTvshow(item, &tv) {
				return
			}
			// The year range lives in the value column, not the item.
			if !parseTvshowRange(value, &tv) {
				return
			}
			if existed, err := parseId(atoms, item, &tv.Id); err != nil {
				csql.Panic(err)
			} else if !existed {
				// We only add a name when we add an atom.
				if err = nameIns.Exec(tv.Id, tv.Title); err != nil {
					logf("Full tvshow info (that failed to add): %#v", tv)
					csql.Panic(ef("Could not add name '%s': %s", tv, err))
				}
			}
			err := tvIns.Exec(tv.Id, tv.Year, tv.Sequence,
				tv.YearStart, tv.YearEnd)
			if err != nil {
				logf("Full tvshow info (that failed to add): %#v", tv)
				csql.Panic(ef("Could not add tvshow '%s': %s", tv, err))
			}
			addedTvshows++
		case imdb.EntityEpisode:
			ep := imdb.Episode{}
			// Episodes need the atomizer to resolve their parent show.
			if !parseEpisode(atoms, item, &ep) {
				return
			}
			if !parseEpisodeYear(value, &ep) {
				return
			}
			if existed, err := parseId(atoms, item, &ep.Id); err != nil {
				csql.Panic(err)
			} else if !existed {
				// We only add a name when we add an atom.
				if err = nameIns.Exec(ep.Id, ep.Title); err != nil {
					logf("Full episode info (that failed to add): %#v", ep)
					csql.Panic(ef("Could not add name '%s': %s", ep, err))
				}
			}
			err := epIns.Exec(ep.Id, ep.TvshowId, ep.Year,
				ep.Season, ep.EpisodeNum)
			if err != nil {
				logf("Full episode info (that failed to add): %#v", ep)
				csql.Panic(ef("Could not add episode '%s': %s", ep, err))
			}
			addedEpisodes++
		default:
			csql.Panic(ef("Unrecognized entity %s", ent))
		}
	})
	return
}
func listActs( db *imdb.DB, r io.ReadCloser, atoms *atomizer, added map[imdb.Atom]struct{}, actIns, credIns, nameIns *csql.Inserter, ) (addedActors, addedCredits int) { bunkName, bunkTitles := []byte("Name"), []byte("Titles") bunkLines1, bunkLines2 := []byte("----"), []byte("------") listAttrRows(r, atoms, func(line, idstr, row []byte) { if bytes.Equal(idstr, bunkName) && bytes.Equal(row, bunkTitles) { return } if bytes.Equal(idstr, bunkLines1) && bytes.Equal(row, bunkLines2) { return } var a imdb.Actor existed, err := parseId(atoms, idstr, &a.Id) if err != nil { csql.Panic(err) } if !existed { if !parseActorName(idstr, &a) { logf("Could not parse actor name '%s' in '%s'.", idstr, line) return } // We only add a name when we've added an atom. if err := nameIns.Exec(a.Id, a.FullName); err != nil { csql.Panic(ef("Could not add actor name '%s' from '%s': %s", idstr, line, err)) } } // If we haven't seen this actor before, then insert into actor table. if _, ok := added[a.Id]; !ok { if len(a.FullName) == 0 { if !parseActorName(idstr, &a) { logf("Could not get actor name '%s' in '%s'.", idstr, line) return } } if err := actIns.Exec(a.Id, a.Sequence); err != nil { csql.Panic(ef("Could not add actor info '%#v' from '%s': %s", a, line, err)) } added[a.Id] = struct{}{} addedActors++ } // Reading this list always refreshes the credits. var c credit c.ActorId = a.Id if !parseCredit(atoms, row, &c) { // messages are emitted in parseCredit if something is worth // reporting return } err = credIns.Exec(c.ActorId, c.MediaId, c.Character, c.Position, c.Attrs) if err != nil { csql.Panic(ef("Could not add credit '%s' for '%s': %s", row, idstr, err)) } addedCredits++ }) return }
func listActors(db *imdb.DB, ractor, ractress io.ReadCloser) (err error) { defer csql.Safe(&err) logf("Reading actors list...") // PostgreSQL wants different transactions for each inserter. // SQLite can't handle them. The wrapper type here ensures that // PostgreSQL gets multiple transactions while SQLite only gets one. tx, err := db.Begin() csql.Panic(err) txactor := wrapTx(db, tx) txcredit := txactor.another() txname := txactor.another() txatom := txactor.another() // Drop data from the actor and credit tables. They will be rebuilt below. // The key here is to leave the atom and name tables alone. Invariably, // they will contain stale data. But the only side effect, I think, is // taking up space. // (Stale data can be removed with 'goim clean'.) csql.Truncate(txactor, db.Driver, "actor") csql.Truncate(txcredit.Tx, db.Driver, "credit") actIns, err := csql.NewInserter(txactor.Tx, db.Driver, "actor", "atom_id", "sequence") csql.Panic(err) credIns, err := csql.NewInserter(txcredit.Tx, db.Driver, "credit", "actor_atom_id", "media_atom_id", "character", "position", "attrs") csql.Panic(err) nameIns, err := csql.NewInserter(txname.Tx, db.Driver, "name", "atom_id", "name") csql.Panic(err) atoms, err := newAtomizer(db, txatom.Tx) csql.Panic(err) // Unfortunately, it looks like credits for an actor can appear in // multiple locations. (Or there are different actors that erroneously // have the same name.) added := make(map[imdb.Atom]struct{}, 3000000) n1, nc1 := listActs(db, ractress, atoms, added, actIns, credIns, nameIns) n2, nc2 := listActs(db, ractor, atoms, added, actIns, credIns, nameIns) csql.Panic(actIns.Exec()) csql.Panic(credIns.Exec()) csql.Panic(nameIns.Exec()) csql.Panic(atoms.Close()) csql.Panic(txactor.Commit()) csql.Panic(txcredit.Commit()) csql.Panic(txname.Commit()) csql.Panic(txatom.Commit()) logf("Done. Added %d actors/actresses and %d credits.", n1+n2, nc1+nc2) return }