//向sqlCode添加"插入1行数据"的语句,执行前须保证Create()、AddRow()已经执行 //insert into table1(field1,field2) values(rowValues[0],rowValues[1]) func (self *MyTable) Update(db *sql.DB) { if self.tableName != "" { self.sqlCode = `insert into ` + self.tableName + `(` if self.columnNames != nil { for _, v1 := range self.columnNames { self.sqlCode += "`" + v1 + "`" + `,` } self.sqlCode = string(self.sqlCode[:len(self.sqlCode)-1]) self.sqlCode += `)values(` } if self.rowValues != nil { for _, v2 := range self.rowValues { v2 = strings.Replace(v2, `"`, `\"`, -1) self.sqlCode += `"` + v2 + `"` + `,` } self.sqlCode = string(self.sqlCode[:len(self.sqlCode)-1]) self.sqlCode += `);` } } stmt, err := db.Prepare(self.sqlCode) util.CheckErr(err) _, err = stmt.Exec() util.CheckErr(err) // 清空临时数据 self.rowValues = []string{} }
//生成"创建表单"的语句,执行前须保证SetTableName()、AddColumn()已经执行 func (self *MyTable) Create(db *sql.DB) { if self.tableName != "" { self.sqlCode = `create table if not exists ` + self.tableName + `(` self.sqlCode += ` id int(8) not null primary key auto_increment` if self.columnNames != nil { for _, rowValues := range self.columnNames { self.sqlCode += `,` + rowValues + ` varchar(255) not null` } } self.sqlCode += `);` } stmt, err := db.Prepare(self.sqlCode) util.CheckErr(err) _, err = stmt.Exec() util.CheckErr(err) }
//生成"创建表单"的语句,执行前须保证SetTableName()、AddColumn()已经执行 func (self *MyTable) Create() *MyTable { if len(self.columnNames) == 0 { return self } self.sqlCode = `create table if not exists ` + self.tableName + `(` if !self.customPrimaryKey { self.sqlCode += `id int(12) not null primary key auto_increment,` } for _, rowValues := range self.columnNames { self.sqlCode += rowValues[0] + ` ` + rowValues[1] + `,` } self.sqlCode = string(self.sqlCode[:len(self.sqlCode)-1]) self.sqlCode += `);` stmt, err := self.DB.Prepare(self.sqlCode) util.CheckErr(err) _, err = stmt.Exec() util.CheckErr(err) return self }
//智能插入数据,每次1行 func (self *MyTable) AutoInsert(value []string) *MyTable { var nsize int for _, v := range value { nsize += len(v) } if nsize > max_allowed_packet { logs.Log.Error("%v", "packet for query is too large. Try adjusting the 'maxallowedpacket'variable in the 'config.ini'") return self } self.size += nsize if self.size > max_allowed_packet { util.CheckErr(self.FlushInsert()) return self.AutoInsert(value) } return self.addRow(value) }
func init() { Output["mysql"] = func(self *Collector, dataIndex int) { db, ok := mysql.MysqlPool.GetOne().(*mysql.MysqlSrc) if !ok || db == nil { logs.Log.Error("链接Mysql数据库超时,无法输出!") return } defer mysql.MysqlPool.Free(db) var mysqls = make(map[string]*mysql.MyTable) var namespace = util.FileNameReplace(self.namespace()) for _, datacell := range self.DockerQueue.Dockers[dataIndex] { subNamespace := util.FileNameReplace(self.subNamespace(datacell)) var tName = namespace if subNamespace != "" { tName += "__" + subNamespace } if _, ok := mysqls[subNamespace]; !ok { mysqls[subNamespace] = mysql.New(db.DB) mysqls[subNamespace].SetTableName(tName) for _, title := range self.MustGetRule(datacell["RuleName"].(string)).ItemFields { mysqls[subNamespace].AddColumn(title + ` MEDIUMTEXT`) } mysqls[subNamespace]. AddColumn(`Url VARCHAR(255)`, `ParentUrl VARCHAR(255)`, `DownloadTime VARCHAR(50)`). Create() } for _, title := range self.MustGetRule(datacell["RuleName"].(string)).ItemFields { vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { mysqls[subNamespace].AddRow(v) } else { mysqls[subNamespace].AddRow(util.JsonString(vd[title])) } } err := mysqls[subNamespace]. AddRow(datacell["Url"].(string), datacell["ParentUrl"].(string), datacell["DownloadTime"].(string)). Update() util.CheckErr(err) } } }
func init() { var ( mysqlTable = map[string]*mysql.MyTable{} mysqlTableLock sync.RWMutex ) var getMysqlTable = func(name string) (*mysql.MyTable, bool) { mysqlTableLock.RLock() defer mysqlTableLock.RUnlock() tab, ok := mysqlTable[name] if ok { return tab.Clone(), true } return nil, false } var setMysqlTable = func(name string, tab *mysql.MyTable) { mysqlTableLock.Lock() mysqlTable[name] = tab mysqlTableLock.Unlock() } DataOutput["mysql"] = func(self *Collector) error { _, err := mysql.DB() if err != nil { return fmt.Errorf("Mysql数据库链接失败: %v", err) } var ( mysqls = make(map[string]*mysql.MyTable) namespace = util.FileNameReplace(self.namespace()) ) for _, datacell := range self.dataDocker { subNamespace := util.FileNameReplace(self.subNamespace(datacell)) tName := joinNamespaces(namespace, subNamespace) table, ok := mysqls[tName] if !ok { table, ok = getMysqlTable(tName) if ok { mysqls[tName] = table } else { table = mysql.New() table.SetTableName(tName) for _, title := range self.MustGetRule(datacell["RuleName"].(string)).ItemFields { table.AddColumn(title + ` MEDIUMTEXT`) } if self.Spider.OutDefaultField() { table.AddColumn(`Url VARCHAR(255)`, `ParentUrl VARCHAR(255)`, `DownloadTime VARCHAR(50)`) } if err := table.Create(); err != nil { logs.Log.Error("%v", err) continue } else { setMysqlTable(tName, table) mysqls[tName] = table } } } data := []string{} for _, title := range self.MustGetRule(datacell["RuleName"].(string)).ItemFields { vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { data = append(data, v) } else { data = append(data, util.JsonString(vd[title])) } } if self.Spider.OutDefaultField() { data = append(data, datacell["Url"].(string), datacell["ParentUrl"].(string), datacell["DownloadTime"].(string)) } table.AutoInsert(data) } for _, tab := range mysqls { util.CheckErr(tab.FlushInsert()) } mysqls = nil return nil } }
/************************ Kafka 输出 ***************************/ func init() { var ( kafkaSenders = map[string]*kafka.KafkaSender{} kafkaSenderLock sync.RWMutex ) var getKafkaSender = func(name string) (*kafka.KafkaSender, bool) { kafkaSenderLock.RLock() tab, ok := kafkaSenders[name] kafkaSenderLock.RUnlock() return tab, ok } var setKafkaSender = func(name string, tab *kafka.KafkaSender) { kafkaSenderLock.Lock() kafkaSenders[name] = tab kafkaSenderLock.Unlock() } DataOutput["kafka"] = func(self *Collector) error { _, err := kafka.GetProducer() if err != nil { return fmt.Errorf("kafka producer失败: %v", err) } var ( kafkas = make(map[string]*kafka.KafkaSender) namespace = util.FileNameReplace(self.namespace()) ) for _, datacell := range self.dataDocker { subNamespace := util.FileNameReplace(self.subNamespace(datacell)) topicName := joinNamespaces(namespace, subNamespace) sender, ok := kafkas[topicName] if !ok { sender, ok = getKafkaSender(topicName) if ok { kafkas[topicName] = sender } else { sender = kafka.New() sender.SetTopic(topicName) setKafkaSender(topicName, sender) kafkas[topicName] = sender } } data := make(map[string]interface{}) for _, title := range self.MustGetRule(datacell["RuleName"].(string)).ItemFields { vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { data[title] = v } else { data[title] = util.JsonString(vd[title]) } } if self.Spider.OutDefaultField() { data["url"] = datacell["Url"].(string) data["parent_url"] = datacell["ParentUrl"].(string) data["download_time"] = datacell["DownloadTime"].(string) } err := sender.Push(data) util.CheckErr(err) } kafkas = nil return nil } }