Example 1
func startSimpleLoad(db *imdb.DB, table string, columns ...string) *simpleLoad {
	logf("Reading list to populate table %s...", table)

	tx, err := db.Begin()
	csql.Panic(err)
	csql.Truncate(tx, db.Driver, table)
	ins, err := csql.NewInserter(tx, db.Driver, table, columns...)
	csql.Panic(err)
	atoms, err := newAtomizer(db, nil) // read only
	csql.Panic(err)
	return &simpleLoad{db, tx, table, 0, ins, atoms}
}
Example 2
// another produces a transaction from tx. It may or may not return the same
// transaction depending on the driver being used.
func (tx *tx) another() *tx {
	if tx.db.Driver == "sqlite3" {
		return tx
	}
	txx, err := tx.db.Begin()
	csql.Panic(err)
	return wrapTx(tx.db, txx)
}
Example 3
func (sl *simpleLoad) add(line []byte, args ...interface{}) {
	if err := sl.ins.Exec(args...); err != nil {
		toStr := func(v interface{}) string { return sf("%#v", v) }
		logf("Full %s info (that failed to add): %s",
			sl.table, fun.Map(toStr, args).([]string))
		logf("Context: %s", line)
		csql.Panic(ef("Error adding to %s table: %s", sl.table, err))
	}
	sl.count++
}
Example 4
// databaseSize returns a pretty string indicating the size of the entire
// database on disk.
func databaseSize(db *imdb.DB, dsn string) string {
	if db.Driver == "sqlite3" {
		fi, err := os.Stat(dsn)
		csql.Panic(err)
		return prettyFileSize(fi.Size())
	}
	var size string
	q := sf("SELECT pg_size_pretty(pg_database_size(current_database()))")
	csql.Scan(db.QueryRow(q), &size)
	return size
}
Example 5
// newAtomizer returns an atomizer that can be used to access or create new
// atom identifiers. Note that if tx is nil, then the atomizer returned is
// read-only (attempting to write will cause a panic).
//
// A read-only atomizer may be accessed from multiple goroutines
// simultaneously, but a read/write atomizer may NOT.
//
// If a read/write atomizer is created, then the caller is responsible for
// closing the transaction (which should be done immediately after a call to
// atomizer.Close).
//
// Note that this function loads the entire set of atoms from the database
// into memory, so it is costly.
func newAtomizer(db *imdb.DB, tx *sql.Tx) (az *atomizer, err error) {
	defer csql.Safe(&err)

	az = &atomizer{db, make(atomMap, 1000000), 0, nil}
	if tx != nil {
		var err error
		az.ins, err = csql.NewInserter(
			tx, db.Driver, "atom", "id", "hash")
		csql.Panic(err)
	}

	rs := csql.Query(db, "SELECT id, hash FROM atom ORDER BY id ASC")
	csql.ForRow(rs, az.readRow)
	az.nextId++
	return
}
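
The contract described in the comment above (read-only versus read/write, and closing the atomizer before committing) can be summarized in a short sketch. The function below is hypothetical; it only illustrates the call order also visible in Examples 8 and 10.

// Hypothetical illustration of both atomizer modes; not part of the source.
func atomizerUsage(db *imdb.DB) error {
	// Read-only mode: tx is nil, the atomizer may be shared across
	// goroutines, and any attempt to create a new atom panics.
	readOnly, err := newAtomizer(db, nil)
	if err != nil {
		return err
	}
	_ = readOnly

	// Read/write mode: the caller owns the transaction and must commit
	// it immediately after closing the atomizer.
	tx, err := db.Begin()
	if err != nil {
		return err
	}
	rw, err := newAtomizer(db, tx)
	if err != nil {
		return err
	}
	// ... create or look up atoms with rw ...
	if err := rw.Close(); err != nil {
		return err
	}
	return tx.Commit()
}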
Example 6
// listLinesSuspended is just like listLines, except it provides a way to
// disable the filtering of lines containing '{{SUSPENDED}}'. This is useful
// when suspended lines need to be recorded, e.g., to reset state associated
// with an existing entity.
func listLinesSuspended(list io.ReadCloser, suspended bool, do func([]byte)) {
	seenListName := false
	nameSuffix := []byte(" LIST")
	nameSuffix2 := []byte(" TRIVIA")
	nameSuffix3 := []byte(" RATINGS REPORT")
	dataStart, dataEnd := []byte("====="), []byte("----------")
	dataSection := false
	scanner := bufio.NewScanner(list)
	for scanner.Scan() {
		line := scanner.Bytes()
		if !seenListName {
			if bytes.HasSuffix(line, nameSuffix) ||
				bytes.HasSuffix(line, nameSuffix2) {
				seenListName = true
			} else if bytes.HasSuffix(line, nameSuffix3) {
				seenListName = true
				dataSection = true
			}
			continue
		}
		if !dataSection {
			if bytes.HasPrefix(line, dataStart) {
				dataSection = true
			}
			continue
		}
		if dataSection && bytes.HasPrefix(line, dataEnd) {
			continue
		}
		if !suspended && bytes.Contains(line, attrSuspended) {
			continue
		}
		do(line)
	}
	csql.Panic(scanner.Err())
	if err := list.Close(); err != nil {
		logf("Error closing list: %s", err)
	}
}
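
The comment above implies that listLines (used in Example 8) is the variant that does filter suspended lines. A plausible definition, stated as an assumption rather than quoted from the source:

// Assumed definition: listLines simply drops '{{SUSPENDED}}' lines.
func listLines(list io.ReadCloser, do func([]byte)) {
	listLinesSuspended(list, false, do)
}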
Example 7
func (sl *simpleLoad) done() {
	csql.Panic(sl.ins.Exec()) // inserts anything left in the buffer
	csql.Panic(sl.tx.Commit())
	logf("Done with table %s. Inserted %d rows.", sl.table, sl.count)
}
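
Examples 1, 3 and 7 fit together as one small loading pipeline: startSimpleLoad truncates the table and prepares the inserter, add buffers one row per list line, and done flushes and commits. The sketch below shows that presumed call order; the table name, its columns and the per-line field layout are hypothetical.

// Hypothetical loader wiring startSimpleLoad, add and done together.
// The "example" table and its "item"/"value" columns are assumptions.
func loadExampleList(db *imdb.DB, list io.ReadCloser) {
	sl := startSimpleLoad(db, "example", "item", "value")
	listLines(list, func(line []byte) {
		fields := splitListLine(bytes.TrimSpace(line))
		if len(fields) < 2 {
			return
		}
		// add buffers one row and panics (via csql.Panic) on failure.
		sl.add(line, fields[0], fields[1])
	})
	// done flushes the inserter and commits the transaction.
	sl.done()
}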
Example 8
func listMovies(db *imdb.DB, movies io.ReadCloser) (err error) {
	defer csql.Safe(&err)

	logf("Reading movies list...")
	addedMovies, addedTvshows, addedEpisodes := 0, 0, 0

	// PostgreSQL wants different transactions for each inserter.
	// SQLite can't handle them. The wrapper type here ensures that
	// PostgreSQL gets multiple transactions while SQLite only gets one.
	tx, err := db.Begin()
	csql.Panic(err)

	txmovie := wrapTx(db, tx)
	txtv := txmovie.another()
	txepisode := txmovie.another()
	txname := txmovie.another()
	txatom := txmovie.another()

	// Drop data from the movie, tvshow and episode tables. They will be
	// rebuilt below.
	// The key here is to leave the atom and name tables alone. Invariably,
	// they will contain stale data. But the only side effect, I think, is
	// taking up space.
	// (Stale data can be removed with 'goim clean'.)
	csql.Truncate(txmovie, db.Driver, "movie")
	csql.Truncate(txtv, db.Driver, "tvshow")
	csql.Truncate(txepisode, db.Driver, "episode")

	mvIns, err := csql.NewInserter(txmovie.Tx, db.Driver, "movie",
		"atom_id", "year", "sequence", "tv", "video")
	csql.Panic(err)
	tvIns, err := csql.NewInserter(txtv.Tx, db.Driver, "tvshow",
		"atom_id", "year", "sequence", "year_start", "year_end")
	csql.Panic(err)
	epIns, err := csql.NewInserter(txepisode.Tx, db.Driver, "episode",
		"atom_id", "tvshow_atom_id", "year", "season", "episode_num")
	csql.Panic(err)
	nameIns, err := csql.NewInserter(txname.Tx, db.Driver, "name",
		"atom_id", "name")
	csql.Panic(err)
	atoms, err := newAtomizer(db, txatom.Tx)
	csql.Panic(err)

	defer func() {
		csql.Panic(mvIns.Exec())
		csql.Panic(tvIns.Exec())
		csql.Panic(epIns.Exec())
		csql.Panic(nameIns.Exec())
		csql.Panic(atoms.Close())

		csql.Panic(txmovie.Commit())
		csql.Panic(txtv.Commit())
		csql.Panic(txepisode.Commit())
		csql.Panic(txname.Commit())
		csql.Panic(txatom.Commit())

		logf("Done. Added %d movies, %d tv shows and %d episodes.",
			addedMovies, addedTvshows, addedEpisodes)
	}()

	listLines(movies, func(line []byte) {
		line = bytes.TrimSpace(line)
		fields := splitListLine(line)
		if len(fields) <= 1 {
			return
		}
		item, value := fields[0], fields[1]
		switch ent := mediaType(item); ent {
		case imdb.EntityMovie:
			m := imdb.Movie{}
			if !parseMovie(item, &m) {
				return
			}
			if existed, err := parseId(atoms, item, &m.Id); err != nil {
				csql.Panic(err)
			} else if !existed {
				// We only add a name when we add an atom.
				if err = nameIns.Exec(m.Id, m.Title); err != nil {
					logf("Full movie info (that failed to add): %#v", m)
					csql.Panic(ef("Could not add name '%s': %s", m, err))
				}
			}
			err := mvIns.Exec(m.Id, m.Year, m.Sequence, m.Tv, m.Video)
			if err != nil {
				logf("Full movie info (that failed to add): %#v", m)
				csql.Panic(ef("Could not add movie '%s': %s", m, err))
			}
			addedMovies++
		case imdb.EntityTvshow:
			tv := imdb.Tvshow{}
			if !parseTvshow(item, &tv) {
				return
			}
			if !parseTvshowRange(value, &tv) {
				return
			}
			if existed, err := parseId(atoms, item, &tv.Id); err != nil {
				csql.Panic(err)
			} else if !existed {
				// We only add a name when we add an atom.
				if err = nameIns.Exec(tv.Id, tv.Title); err != nil {
					logf("Full tvshow info (that failed to add): %#v", tv)
					csql.Panic(ef("Could not add name '%s': %s", tv, err))
				}
			}
			err := tvIns.Exec(tv.Id, tv.Year, tv.Sequence,
				tv.YearStart, tv.YearEnd)
			if err != nil {
				logf("Full tvshow info (that failed to add): %#v", tv)
				csql.Panic(ef("Could not add tvshow '%s': %s", tv, err))
			}
			addedTvshows++
		case imdb.EntityEpisode:
			ep := imdb.Episode{}
			if !parseEpisode(atoms, item, &ep) {
				return
			}
			if !parseEpisodeYear(value, &ep) {
				return
			}
			if existed, err := parseId(atoms, item, &ep.Id); err != nil {
				csql.Panic(err)
			} else if !existed {
				// We only add a name when we add an atom.
				if err = nameIns.Exec(ep.Id, ep.Title); err != nil {
					logf("Full episode info (that failed to add): %#v", ep)
					csql.Panic(ef("Could not add name '%s': %s", ep, err))
				}
			}
			err := epIns.Exec(ep.Id, ep.TvshowId, ep.Year,
				ep.Season, ep.EpisodeNum)
			if err != nil {
				logf("Full episode info (that failed to add): %#v", ep)
				csql.Panic(ef("Could not add episode '%s': %s", ep, err))
			}
			addedEpisodes++
		default:
			csql.Panic(ef("Unrecognized entity %s", ent))
		}
	})
	return
}
Example 9
func listActs(
	db *imdb.DB,
	r io.ReadCloser,
	atoms *atomizer,
	added map[imdb.Atom]struct{},
	actIns, credIns, nameIns *csql.Inserter,
) (addedActors, addedCredits int) {
	bunkName, bunkTitles := []byte("Name"), []byte("Titles")
	bunkLines1, bunkLines2 := []byte("----"), []byte("------")

	listAttrRows(r, atoms, func(line, idstr, row []byte) {
		if bytes.Equal(idstr, bunkName) && bytes.Equal(row, bunkTitles) {
			return
		}
		if bytes.Equal(idstr, bunkLines1) && bytes.Equal(row, bunkLines2) {
			return
		}

		var a imdb.Actor
		existed, err := parseId(atoms, idstr, &a.Id)
		if err != nil {
			csql.Panic(err)
		}
		if !existed {
			if !parseActorName(idstr, &a) {
				logf("Could not parse actor name '%s' in '%s'.", idstr, line)
				return
			}

			// We only add a name when we've added an atom.
			if err := nameIns.Exec(a.Id, a.FullName); err != nil {
				csql.Panic(ef("Could not add actor name '%s' from '%s': %s",
					idstr, line, err))
			}
		}

		// If we haven't seen this actor before, then insert into actor table.
		if _, ok := added[a.Id]; !ok {
			if len(a.FullName) == 0 {
				if !parseActorName(idstr, &a) {
					logf("Could not get actor name '%s' in '%s'.", idstr, line)
					return
				}
			}
			if err := actIns.Exec(a.Id, a.Sequence); err != nil {
				csql.Panic(ef("Could not add actor info '%#v' from '%s': %s",
					a, line, err))
			}
			added[a.Id] = struct{}{}
			addedActors++
		}

		// Reading this list always refreshes the credits.
		var c credit
		c.ActorId = a.Id
		if !parseCredit(atoms, row, &c) {
			// messages are emitted in parseCredit if something is worth
			// reporting
			return
		}
		err = credIns.Exec(c.ActorId, c.MediaId,
			c.Character, c.Position, c.Attrs)
		if err != nil {
			csql.Panic(ef("Could not add credit '%s' for '%s': %s",
				row, idstr, err))
		}
		addedCredits++
	})
	return
}
Example 10
func listActors(db *imdb.DB, ractor, ractress io.ReadCloser) (err error) {
	defer csql.Safe(&err)

	logf("Reading actors list...")

	// PostgreSQL wants different transactions for each inserter.
	// SQLite can't handle them. The wrapper type here ensures that
	// PostgreSQL gets multiple transactions while SQLite only gets one.
	tx, err := db.Begin()
	csql.Panic(err)

	txactor := wrapTx(db, tx)
	txcredit := txactor.another()
	txname := txactor.another()
	txatom := txactor.another()

	// Drop data from the actor and credit tables. They will be rebuilt below.
	// The key here is to leave the atom and name tables alone. Invariably,
	// they will contain stale data. But the only side effect, I think, is
	// taking up space.
	// (Stale data can be removed with 'goim clean'.)
	csql.Truncate(txactor, db.Driver, "actor")
	csql.Truncate(txcredit.Tx, db.Driver, "credit")

	actIns, err := csql.NewInserter(txactor.Tx, db.Driver, "actor",
		"atom_id", "sequence")
	csql.Panic(err)
	credIns, err := csql.NewInserter(txcredit.Tx, db.Driver, "credit",
		"actor_atom_id", "media_atom_id", "character", "position", "attrs")
	csql.Panic(err)
	nameIns, err := csql.NewInserter(txname.Tx, db.Driver, "name",
		"atom_id", "name")
	csql.Panic(err)
	atoms, err := newAtomizer(db, txatom.Tx)
	csql.Panic(err)

	// Unfortunately, it looks like credits for an actor can appear in
	// multiple locations. (Or there are different actors that erroneously
	// have the same name.)
	added := make(map[imdb.Atom]struct{}, 3000000)
	n1, nc1 := listActs(db, ractress, atoms, added, actIns, credIns, nameIns)
	n2, nc2 := listActs(db, ractor, atoms, added, actIns, credIns, nameIns)

	csql.Panic(actIns.Exec())
	csql.Panic(credIns.Exec())
	csql.Panic(nameIns.Exec())
	csql.Panic(atoms.Close())

	csql.Panic(txactor.Commit())
	csql.Panic(txcredit.Commit())
	csql.Panic(txname.Commit())
	csql.Panic(txatom.Commit())

	logf("Done. Added %d actors/actresses and %d credits.", n1+n2, nc1+nc2)
	return
}