func init() { var ( mysqlTable = map[string]*mysql.MyTable{} mysqlTableLock sync.RWMutex ) var getMysqlTable = func(name string) (*mysql.MyTable, bool) { mysqlTableLock.RLock() defer mysqlTableLock.RUnlock() tab, ok := mysqlTable[name] if ok { return tab.Clone(), true } return nil, false } var setMysqlTable = func(name string, tab *mysql.MyTable) { mysqlTableLock.Lock() mysqlTable[name] = tab mysqlTableLock.Unlock() } DataOutput["mysql"] = func(self *Collector) error { _, err := mysql.DB() if err != nil { return fmt.Errorf("Mysql数据库链接失败: %v", err) } var ( mysqls = make(map[string]*mysql.MyTable) namespace = util.FileNameReplace(self.namespace()) ) for _, datacell := range self.dataDocker { subNamespace := util.FileNameReplace(self.subNamespace(datacell)) tName := joinNamespaces(namespace, subNamespace) table, ok := mysqls[tName] if !ok { table, ok = getMysqlTable(tName) if ok { mysqls[tName] = table } else { table = mysql.New() table.SetTableName(tName) for _, title := range self.MustGetRule(datacell["RuleName"].(string)).ItemFields { table.AddColumn(title + ` MEDIUMTEXT`) } if self.Spider.OutDefaultField() { table.AddColumn(`Url VARCHAR(255)`, `ParentUrl VARCHAR(255)`, `DownloadTime VARCHAR(50)`) } if err := table.Create(); err != nil { logs.Log.Error("%v", err) continue } else { setMysqlTable(tName, table) mysqls[tName] = table } } } data := []string{} for _, title := range self.MustGetRule(datacell["RuleName"].(string)).ItemFields { vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { data = append(data, v) } else { data = append(data, util.JsonString(vd[title])) } } if self.Spider.OutDefaultField() { data = append(data, datacell["Url"].(string), datacell["ParentUrl"].(string), datacell["DownloadTime"].(string)) } table.AutoInsert(data) } for _, tab := range mysqls { util.CheckErr(tab.FlushInsert()) } mysqls = nil return nil } }
// 先清空历史失败记录再更新 func (self *Failure) flush(provider string) (fLen int, err error) { self.RWMutex.Lock() defer self.RWMutex.Unlock() fLen = len(self.list) switch provider { case "mgo": if mgo.Error() != nil { err = fmt.Errorf(" * Fail [添加失败记录][mgo]: %v 条 [ERROR] %v\n", fLen, mgo.Error()) return } mgo.Call(func(src pool.Src) error { c := src.(*mgo.MgoSrc).DB(config.DB_NAME).C(self.tabName) // 删除失败记录文件 c.DropCollection() if fLen == 0 { return nil } var docs = []interface{}{} for key, req := range self.list { docs = append(docs, map[string]interface{}{"_id": key, "failure": req.Serialize()}) } c.Insert(docs...) return nil }) case "mysql": _, err := mysql.DB() if err != nil { return fLen, fmt.Errorf(" * Fail [添加失败记录][mysql]: %v 条 [PING] %v\n", fLen, err) } table, ok := getWriteMysqlTable(self.tabName) if !ok { table = mysql.New() table.SetTableName(self.tabName).CustomPrimaryKey(`id VARCHAR(8) PRIMARY KEY`).AddColumn(`failure MEDIUMTEXT`) setWriteMysqlTable(self.tabName, table) // 创建失败记录表 err = table.Create() if err != nil { return fLen, fmt.Errorf(" * Fail [添加失败记录][mysql]: %v 条 [CREATE] %v\n", fLen, err) } } else { // 清空失败记录表 err = table.Truncate() if err != nil { return fLen, fmt.Errorf(" * Fail [添加失败记录][mysql]: %v 条 [TRUNCATE] %v\n", fLen, err) } } // 添加失败记录 for key, req := range self.list { table.AutoInsert([]string{key, req.Serialize()}) err = table.FlushInsert() if err != nil { fLen-- } } default: // 删除失败记录文件 os.Remove(self.fileName) if fLen == 0 { return } f, _ := os.OpenFile(self.fileName, os.O_CREATE|os.O_WRONLY, 0777) docs := make(map[string]string, len(self.list)) for key, req := range self.list { docs[key] = req.Serialize() } b, _ := json.Marshal(docs) b = bytes.Replace(b, []byte(`\u0026`), []byte(`&`), -1) f.Write(b) f.Close() } return }
// 读取成功记录 func (self *History) ReadSuccess(provider string, inherit bool) { self.RWMutex.Lock() self.provider = provider self.RWMutex.Unlock() if !inherit { // 不继承历史记录时 self.Success.old = make(map[string]bool) self.Success.new = make(map[string]bool) self.Success.inheritable = false return } else if self.Success.inheritable { // 本次与上次均继承历史记录时 return } else { // 上次没有继承历史记录,但本次继承时 self.Success.old = make(map[string]bool) self.Success.new = make(map[string]bool) self.Success.inheritable = true } switch provider { case "mgo": var docs = map[string]interface{}{} err := mgo.Mgo(&docs, "find", map[string]interface{}{ "Database": config.DB_NAME, "Collection": self.Success.tabName, }) if err != nil { logs.Log.Error(" * Fail [读取成功记录][mgo]: %v\n", err) return } for _, v := range docs["Docs"].([]interface{}) { self.Success.old[v.(bson.M)["_id"].(string)] = true } case "mysql": _, err := mysql.DB() if err != nil { logs.Log.Error(" * Fail [读取成功记录][mysql]: %v\n", err) return } table, ok := getReadMysqlTable(self.Success.tabName) if !ok { table = mysql.New().SetTableName(self.Success.tabName) setReadMysqlTable(self.Success.tabName, table) } rows, err := table.SelectAll() if err != nil { return } for rows.Next() { var id string err = rows.Scan(&id) self.Success.old[id] = true } default: f, err := os.Open(self.Success.fileName) if err != nil { return } defer f.Close() b, _ := ioutil.ReadAll(f) if len(b) == 0 { return } b[0] = '{' json.Unmarshal(append(b, '}'), &self.Success.old) } logs.Log.Informational(" * [读取成功记录]: %v 条\n", len(self.Success.old)) }
// 读取失败记录 func (self *History) ReadFailure(provider string, inherit bool) { self.RWMutex.Lock() self.provider = provider self.RWMutex.Unlock() if !inherit { // 不继承历史记录时 self.Failure.list = make(map[*request.Request]bool) self.Failure.inheritable = false return } else if self.Failure.inheritable { // 本次与上次均继承历史记录时 return } else { // 上次没有继承历史记录,但本次继承时 self.Failure.list = make(map[*request.Request]bool) self.Failure.inheritable = true } var fLen int switch provider { case "mgo": if mgo.Error() != nil { logs.Log.Error(" * Fail [读取失败记录][mgo]: %v\n", mgo.Error()) return } var docs = []interface{}{} mgo.Call(func(src pool.Src) error { c := src.(*mgo.MgoSrc).DB(config.DB_NAME).C(self.Failure.tabName) return c.Find(nil).All(&docs) }) fLen = len(docs) for _, v := range docs { failure := v.(bson.M)["_id"].(string) req, err := request.UnSerialize(failure) if err != nil { continue } self.Failure.list[req] = true } case "mysql": _, err := mysql.DB() if err != nil { logs.Log.Error(" * Fail [读取失败记录][mysql]: %v\n", err) return } table, ok := getReadMysqlTable(self.Failure.tabName) if !ok { table = mysql.New().SetTableName(self.Failure.tabName) setReadMysqlTable(self.Failure.tabName, table) } rows, err := table.SelectAll() if err != nil { return } for rows.Next() { var id int var failure string err = rows.Scan(&id, &failure) req, err := request.UnSerialize(failure) if err != nil { continue } self.Failure.list[req] = true fLen++ } default: f, err := os.Open(self.Failure.fileName) if err != nil { return } b, _ := ioutil.ReadAll(f) f.Close() if len(b) == 0 { return } docs := []string{} json.Unmarshal(b, &docs) fLen = len(docs) for _, s := range docs { req, err := request.UnSerialize(s) if err != nil { continue } self.Failure.list[req] = true } } logs.Log.Informational(" * [读取失败记录]: %v 条\n", fLen) }
func (self *Success) flush(provider string) (sLen int, err error) { self.RWMutex.Lock() defer self.RWMutex.Unlock() sLen = len(self.new) if sLen == 0 { return } switch provider { case "mgo": if mgo.Error() != nil { err = fmt.Errorf(" * Fail [添加成功记录][mgo]: %v 条 [ERROR] %v\n", sLen, mgo.Error()) return } var docs = make([]map[string]interface{}, sLen) var i int for key := range self.new { docs[i] = map[string]interface{}{"_id": key} self.old[key] = true i++ } err := mgo.Mgo(nil, "insert", map[string]interface{}{ "Database": config.DB_NAME, "Collection": self.tabName, "Docs": docs, }) if err != nil { err = fmt.Errorf(" * Fail [添加成功记录][mgo]: %v 条 [ERROR] %v\n", sLen, err) } case "mysql": _, err := mysql.DB() if err != nil { return sLen, fmt.Errorf(" * Fail [添加成功记录][mysql]: %v 条 [ERROR] %v\n", sLen, err) } table, ok := getWriteMysqlTable(self.tabName) if !ok { table = mysql.New() table.SetTableName(self.tabName).CustomPrimaryKey(`id VARCHAR(255) not null primary key`) err = table.Create() if err != nil { return sLen, fmt.Errorf(" * Fail [添加成功记录][mysql]: %v 条 [ERROR] %v\n", sLen, err) } setWriteMysqlTable(self.tabName, table) } for key := range self.new { table.AutoInsert([]string{key}) self.old[key] = true } err = table.FlushInsert() if err != nil { return sLen, fmt.Errorf(" * Fail [添加成功记录][mysql]: %v 条 [ERROR] %v\n", sLen, err) } default: f, _ := os.OpenFile(self.fileName, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0777) b, _ := json.Marshal(self.new) b[0] = ',' f.Write(b[:len(b)-1]) f.Close() for key := range self.new { self.old[key] = true } } self.new = make(map[string]bool) return }
// 先清空历史失败记录再更新 func (self *Failure) flush(provider string) (fLen int, err error) { self.RWMutex.Lock() defer self.RWMutex.Unlock() fLen = len(self.list) switch provider { case "mgo": if mgo.Error() != nil { err = fmt.Errorf(" * Fail [添加失败记录][mgo]: %v 条 [ERROR] %v\n", fLen, mgo.Error()) return } mgo.Call(func(src pool.Src) error { c := src.(*mgo.MgoSrc).DB(config.DB_NAME).C(self.tabName) // 删除失败记录文件 c.DropCollection() if fLen == 0 { return nil } var docs = []interface{}{} for req := range self.list { docs = append(docs, map[string]interface{}{"_id": req.Serialize()}) } c.Insert(docs...) return nil }) case "mysql": db, err := mysql.DB() if err != nil { return fLen, fmt.Errorf(" * Fail [添加失败记录][mysql]: %v 条 [ERROR] %v\n", fLen, err) } // 删除失败记录文件 stmt, err := db.Prepare(`DROP TABLE ` + self.tabName) if err != nil { return fLen, fmt.Errorf(" * Fail [添加失败记录][mysql]: %v 条 [ERROR] %v\n", fLen, err) } stmt.Exec() if fLen == 0 { return fLen, nil } table, ok := getWriteMysqlTable(self.tabName) if !ok { table = mysql.New() table.SetTableName(self.tabName).AddColumn(`failure MEDIUMTEXT`) setWriteMysqlTable(self.tabName, table) } // 添加失败请求 err = table.Create() if err != nil { return fLen, fmt.Errorf(" * Fail [添加失败记录][mysql]: %v 条 [ERROR] %v\n", fLen, err) } for req := range self.list { table.AutoInsert([]string{req.Serialize()}) } err = table.FlushInsert() if err != nil { return fLen, fmt.Errorf(" * Fail [添加失败记录][mysql]: %v 条 [ERROR] %v\n", fLen, err) } default: // 删除失败记录文件 os.Remove(self.fileName) if fLen == 0 { return } f, _ := os.OpenFile(self.fileName, os.O_CREATE|os.O_WRONLY, 0777) docs := make([]string, len(self.list)) i := 0 for req := range self.list { docs[i] = req.Serialize() i++ } b, _ := json.Marshal(docs) b = bytes.Replace(b, []byte(`\u0026`), []byte(`&`), -1) f.Write(b) f.Close() } return }