func (p *KafkaSender) Push(data map[string]interface{}) error { val := util.JsonString(data) _, _, err := producer.SendMessage(&sarama.ProducerMessage{ Topic: p.topic, Value: sarama.StringEncoder(val), }) return err }
func init() { Output["mysql"] = func(self *Collector, dataIndex int) { db, ok := mysql.MysqlPool.GetOne().(*mysql.MysqlSrc) if !ok || db == nil { logs.Log.Error("链接Mysql数据库超时,无法输出!") return } defer mysql.MysqlPool.Free(db) var mysqls = make(map[string]*mysql.MyTable) var namespace = util.FileNameReplace(self.namespace()) for _, datacell := range self.DockerQueue.Dockers[dataIndex] { subNamespace := util.FileNameReplace(self.subNamespace(datacell)) var tName = namespace if subNamespace != "" { tName += "__" + subNamespace } if _, ok := mysqls[subNamespace]; !ok { mysqls[subNamespace] = mysql.New(db.DB) mysqls[subNamespace].SetTableName(tName) for _, title := range self.MustGetRule(datacell["RuleName"].(string)).ItemFields { mysqls[subNamespace].AddColumn(title + ` MEDIUMTEXT`) } mysqls[subNamespace]. AddColumn(`Url VARCHAR(255)`, `ParentUrl VARCHAR(255)`, `DownloadTime VARCHAR(50)`). Create() } for _, title := range self.MustGetRule(datacell["RuleName"].(string)).ItemFields { vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { mysqls[subNamespace].AddRow(v) } else { mysqls[subNamespace].AddRow(util.JsonString(vd[title])) } } err := mysqls[subNamespace]. AddRow(datacell["Url"].(string), datacell["ParentUrl"].(string), datacell["DownloadTime"].(string)). Update() util.CheckErr(err) } } }
func init() { Output["mysql"] = func(self *Collector, dataIndex int) { db := mysql.MysqlPool.GetOne().(*mysql.MysqlFish) defer mysql.MysqlPool.Free(db) var newMysql = new(mysql.MyTable) for Name, Rule := range self.GetRules() { //跳过不输出的数据 if len(Rule.GetOutFeild()) == 0 { continue } newMysql.SetTableName("`" + tabName(self, Name) + "`") for _, title := range Rule.GetOutFeild() { newMysql.AddColumn(title) } newMysql.AddColumn("当前连接", "上级链接", "下载时间"). Create(db.DB) num := 0 //小计 for _, datacell := range self.DockerQueue.Dockers[dataIndex] { if datacell["RuleName"].(string) == Name { for _, title := range Rule.GetOutFeild() { vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { newMysql.AddRow(v) } else { newMysql.AddRow(util.JsonString(vd[title])) } } newMysql.AddRow(datacell["Url"].(string), datacell["ParentUrl"].(string), datacell["DownloadTime"].(string)). Update(db.DB) num++ } } newMysql = new(mysql.MyTable) } } }
func init() { Output["mysql"] = func(self *Collector, dataIndex int) { db, ok := mysql.MysqlPool.GetOne().(*mysql.MysqlSrc) if !ok || db == nil { logs.Log.Error("链接Mysql数据库超时,无法输出!") return } defer mysql.MysqlPool.Free(db) var mysqls = make(map[string]*mysql.MyTable) var namespace = util.FileNameReplace(self.namespace()) for _, datacell := range self.DockerQueue.Dockers[dataIndex] { subNamespace := util.FileNameReplace(self.subNamespace(datacell)) if _, ok := mysqls[subNamespace]; !ok { mysqls[subNamespace] = mysql.New(db.DB) mysqls[subNamespace].SetTableName("`" + namespace + "__" + subNamespace + "`") for _, title := range self.GetRule(datacell["RuleName"].(string)).GetOutFeild() { mysqls[subNamespace].AddColumn(title) } mysqls[subNamespace]. AddColumn("Url", "ParentUrl", "DownloadTime"). Create() } for _, title := range self.GetRule(datacell["RuleName"].(string)).GetOutFeild() { vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { mysqls[subNamespace].AddRow(v) } else { mysqls[subNamespace].AddRow(util.JsonString(vd[title])) } } mysqls[subNamespace]. AddRow(datacell["Url"].(string), datacell["ParentUrl"].(string), datacell["DownloadTime"].(string)). Update() } } }
/************************ excel 输出 ***************************/ func init() { Output["excel"] = func(self *Collector, dataIndex int) { defer func() { if err := recover(); err != nil { Log.Println(err) } }() var file *xlsx.File var sheet *xlsx.Sheet var row *xlsx.Row var cell *xlsx.Cell var err error folder1 := "result/data" folder2 := folder1 + "/" + self.startTime.Format("2006年01月02日 15时04分05秒") filename := folder2 + "/" + util.FileNameReplace(self.Spider.GetName()+"_"+self.Spider.GetKeyword()+" "+fmt.Sprintf("%v", self.sum[0])+"-"+fmt.Sprintf("%v", self.sum[1])) + ".xlsx" // 创建文件 file = xlsx.NewFile() // 添加分类数据工作表 for Name, Rule := range self.GetRules() { // 跳过不输出的数据 if len(Rule.GetOutFeild()) == 0 { continue } // 添加工作表 sheet = file.AddSheet(util.ExcelSheetNameReplace(Name)) // 写入表头 row = sheet.AddRow() for _, title := range Rule.GetOutFeild() { cell = row.AddCell() cell.Value = title } cell = row.AddCell() cell.Value = "当前链接" cell = row.AddCell() cell.Value = "上级链接" cell = row.AddCell() cell.Value = "下载时间" num := 0 //小计 for _, datacell := range self.DockerQueue.Dockers[dataIndex] { if datacell["RuleName"].(string) == Name { row = sheet.AddRow() for _, title := range Rule.GetOutFeild() { cell = row.AddCell() vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { cell.Value = v } else { cell.Value = util.JsonString(vd[title]) } } cell = row.AddCell() cell.Value = datacell["Url"].(string) cell = row.AddCell() cell.Value = datacell["ParentUrl"].(string) cell = row.AddCell() cell.Value = datacell["DownloadTime"].(string) num++ } } // Log.Printf("[任务:%v | 关键词:%v | 小类:%v] 输出 %v 条数据!!!\n", self.Spider.GetName(), self.Spider.GetKeyword(), Name, num) } // 创建/打开目录 f2, err := os.Stat(folder2) if err != nil || !f2.IsDir() { if err := os.MkdirAll(folder2, 0777); err != nil { Log.Printf("Error: %v\n", err) } } // 保存文件 err = file.Save(filename) if err != nil { Log.Println(err) } } }
func init() { var ( mysqlTable = map[string]*mysql.MyTable{} mysqlTableLock sync.RWMutex ) var getMysqlTable = func(name string) (*mysql.MyTable, bool) { mysqlTableLock.RLock() defer mysqlTableLock.RUnlock() tab, ok := mysqlTable[name] if ok { return tab.Clone(), true } return nil, false } var setMysqlTable = func(name string, tab *mysql.MyTable) { mysqlTableLock.Lock() mysqlTable[name] = tab mysqlTableLock.Unlock() } DataOutput["mysql"] = func(self *Collector) error { _, err := mysql.DB() if err != nil { return fmt.Errorf("Mysql数据库链接失败: %v", err) } var ( mysqls = make(map[string]*mysql.MyTable) namespace = util.FileNameReplace(self.namespace()) ) for _, datacell := range self.dataDocker { subNamespace := util.FileNameReplace(self.subNamespace(datacell)) tName := joinNamespaces(namespace, subNamespace) table, ok := mysqls[tName] if !ok { table, ok = getMysqlTable(tName) if ok { mysqls[tName] = table } else { table = mysql.New() table.SetTableName(tName) for _, title := range self.MustGetRule(datacell["RuleName"].(string)).ItemFields { table.AddColumn(title + ` MEDIUMTEXT`) } if self.Spider.OutDefaultField() { table.AddColumn(`Url VARCHAR(255)`, `ParentUrl VARCHAR(255)`, `DownloadTime VARCHAR(50)`) } if err := table.Create(); err != nil { logs.Log.Error("%v", err) continue } else { setMysqlTable(tName, table) mysqls[tName] = table } } } data := []string{} for _, title := range self.MustGetRule(datacell["RuleName"].(string)).ItemFields { vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { data = append(data, v) } else { data = append(data, util.JsonString(vd[title])) } } if self.Spider.OutDefaultField() { data = append(data, datacell["Url"].(string), datacell["ParentUrl"].(string), datacell["DownloadTime"].(string)) } table.AutoInsert(data) } for _, tab := range mysqls { util.CheckErr(tab.FlushInsert()) } mysqls = nil return nil } }
/************************ CSV 输出 ***************************/ func init() { Output["csv"] = func(self *Collector, dataIndex int) { defer func() { if err := recover(); err != nil { logs.Log.Error("%v", err) } }() var namespace = util.FileNameReplace(self.namespace()) var sheets = make(map[string]*csv.Writer) for _, datacell := range self.DockerQueue.Dockers[dataIndex] { var subNamespace = util.FileNameReplace(self.subNamespace(datacell)) if _, ok := sheets[subNamespace]; !ok { folder := config.COMM_PATH.TEXT + "/" + cache.StartTime.Format("2006年01月02日 15时04分05秒") + "/" + namespace + "__" + subNamespace filename := fmt.Sprintf("%v/%v-%v.csv", folder, self.sum[0], self.sum[1]) // 创建/打开目录 f, err := os.Stat(folder) if err != nil || !f.IsDir() { if err := os.MkdirAll(folder, 0777); err != nil { logs.Log.Error("Error: %v\n", err) } } // 按数据分类创建文件 file, err := os.Create(filename) if err != nil { logs.Log.Error("%v", err) continue } file.WriteString("\xEF\xBB\xBF") // 写入UTF-8 BOM sheets[subNamespace] = csv.NewWriter(file) th := self.MustGetRule(datacell["RuleName"].(string)).ItemFields th = append(th, "当前链接", "上级链接", "下载时间") sheets[subNamespace].Write(th) defer func(file *os.File) { // 发送缓存数据流 sheets[subNamespace].Flush() // 关闭文件 file.Close() }(file) } row := []string{} for _, title := range self.MustGetRule(datacell["RuleName"].(string)).ItemFields { vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { row = append(row, v) } else { row = append(row, util.JsonString(vd[title])) } } row = append(row, datacell["Url"].(string)) row = append(row, datacell["ParentUrl"].(string)) row = append(row, datacell["DownloadTime"].(string)) sheets[subNamespace].Write(row) } } }
// 请求的序列化 func (self *Request) Serialization() string { return util.JsonString(self) }
/************************ CSV 输出 ***************************/ func init() { Output["csv"] = func(self *Collector, dataIndex int) { defer func() { if err := recover(); err != nil { Log.Println(err) } }() folder1 := "result/data" folder2 := folder1 + "/" + self.startTime.Format("2006年01月02日 15时04分05秒") filenameBase := folder2 + "/" + util.FileNameReplace(self.Spider.GetName()+"_"+self.Spider.GetKeyword()+" "+fmt.Sprintf("%v", self.sum[0])+"-"+fmt.Sprintf("%v", self.sum[1])) // 创建/打开目录 f2, err := os.Stat(folder2) if err != nil || !f2.IsDir() { if err := os.MkdirAll(folder2, 0777); err != nil { Log.Printf("Error: %v\n", err) } } // 按数据分类创建文件 for Name, Rule := range self.GetRules() { // 跳过不输出的数据 if len(Rule.GetOutFeild()) == 0 { continue } file, err := os.Create(filenameBase + " (" + util.FileNameReplace(Name) + ").csv") if err != nil { Log.Println(err) continue } file.WriteString("\xEF\xBB\xBF") // 写入UTF-8 BOM w := csv.NewWriter(file) th := Rule.GetOutFeild() th = append(th, []string{"当前链接", "上级链接", "下载时间"}...) w.Write(th) num := 0 //小计 for _, datacell := range self.DockerQueue.Dockers[dataIndex] { if datacell["RuleName"].(string) == Name { row := []string{} for _, title := range Rule.GetOutFeild() { vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { row = append(row, v) } else { row = append(row, util.JsonString(vd[title])) } } row = append(row, datacell["Url"].(string)) row = append(row, datacell["ParentUrl"].(string)) row = append(row, datacell["DownloadTime"].(string)) w.Write(row) num++ } } // 发送缓存数据流 w.Flush() // 关闭文件 file.Close() // 输出报告 // Log.Printf("[任务:%v | 关键词:%v | 小类:%v] 输出 %v 条数据!!!\n", self.Spider.GetName(), self.Spider.GetKeyword(), Name, num) } } }
func init() { defer func() { // 获取输出方式列表 for out, _ := range Output { OutputLib = append(OutputLib, out) } util.StringsSort(OutputLib) }() /************************ excel 输出 ***************************/ Output["excel"] = func(self *Collector, dataIndex int) { defer func() { if err := recover(); err != nil { Log.Println(err) } }() var file *xlsx.File var sheet *xlsx.Sheet var row *xlsx.Row var cell *xlsx.Cell var err error folder1 := "result/data" folder2 := folder1 + "/" + self.startTime.Format("2006年01月02日 15时04分05秒") filename := folder2 + "/" + util.FileNameReplace(self.Spider.GetName()+"_"+self.Spider.GetKeyword()+" "+fmt.Sprintf("%v", self.sum[0])+"-"+fmt.Sprintf("%v", self.sum[1])) + ".xlsx" // 创建文件 file = xlsx.NewFile() // 添加分类数据工作表 for Name, Rule := range self.GetRules() { // 跳过不输出的数据 if len(Rule.GetOutFeild()) == 0 { continue } // 添加工作表 sheet = file.AddSheet(util.ExcelSheetNameReplace(Name)) // 写入表头 row = sheet.AddRow() for _, title := range Rule.GetOutFeild() { cell = row.AddCell() cell.Value = title } cell = row.AddCell() cell.Value = "当前链接" cell = row.AddCell() cell.Value = "上级链接" cell = row.AddCell() cell.Value = "下载时间" num := 0 //小计 for _, datacell := range self.DockerQueue.Dockers[dataIndex] { if datacell["RuleName"].(string) == Name { row = sheet.AddRow() for _, title := range Rule.GetOutFeild() { cell = row.AddCell() vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { cell.Value = v } else { cell.Value = util.JsonString(vd[title]) } } cell = row.AddCell() cell.Value = datacell["Url"].(string) cell = row.AddCell() cell.Value = datacell["ParentUrl"].(string) cell = row.AddCell() cell.Value = datacell["DownloadTime"].(string) num++ } } // Log.Printf("[任务:%v | 关键词:%v | 小类:%v] 输出 %v 条数据!!!\n", self.Spider.GetName(), self.Spider.GetKeyword(), Name, num) } // 创建/打开目录 f2, err := os.Stat(folder2) if err != nil || !f2.IsDir() { if err := os.MkdirAll(folder2, 0777); err != nil { Log.Printf("Error: %v\n", err) } } // 保存文件 err = file.Save(filename) if err != nil { Log.Println(err) } } /************************ CSV 输出 ***************************/ Output["csv"] = func(self *Collector, dataIndex int) { defer func() { if err := recover(); err != nil { Log.Println(err) } }() folder1 := "result/data" folder2 := folder1 + "/" + self.startTime.Format("2006年01月02日 15时04分05秒") filenameBase := folder2 + "/" + util.FileNameReplace(self.Spider.GetName()+"_"+self.Spider.GetKeyword()+" "+fmt.Sprintf("%v", self.sum[0])+"-"+fmt.Sprintf("%v", self.sum[1])) // 创建/打开目录 f2, err := os.Stat(folder2) if err != nil || !f2.IsDir() { if err := os.MkdirAll(folder2, 0777); err != nil { Log.Printf("Error: %v\n", err) } } // 按数据分类创建文件 for Name, Rule := range self.GetRules() { // 跳过不输出的数据 if len(Rule.GetOutFeild()) == 0 { continue } file, err := os.Create(filenameBase + " (" + util.FileNameReplace(Name) + ").csv") if err != nil { Log.Println(err) continue } file.WriteString("\xEF\xBB\xBF") // 写入UTF-8 BOM w := csv.NewWriter(file) th := Rule.GetOutFeild() th = append(th, []string{"当前链接", "上级链接", "下载时间"}...) w.Write(th) num := 0 //小计 for _, datacell := range self.DockerQueue.Dockers[dataIndex] { if datacell["RuleName"].(string) == Name { row := []string{} for _, title := range Rule.GetOutFeild() { vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { row = append(row, v) } else { row = append(row, util.JsonString(vd[title])) } } row = append(row, datacell["Url"].(string)) row = append(row, datacell["ParentUrl"].(string)) row = append(row, datacell["DownloadTime"].(string)) w.Write(row) num++ } } // 发送缓存数据流 w.Flush() // 关闭文件 file.Close() // 输出报告 // Log.Printf("[任务:%v | 关键词:%v | 小类:%v] 输出 %v 条数据!!!\n", self.Spider.GetName(), self.Spider.GetKeyword(), Name, num) } } /************************ MongoDB 输出 ***************************/ Output["mgo"] = func(self *Collector, dataIndex int) { session, err := mgo.Dial(config.DB_URL) //连接数据库 if err != nil { panic(err) } defer session.Close() session.SetMode(mgo.Monotonic, true) db := session.DB(config.DB_NAME) //数据库名称 collection := db.C(config.DB_COLLECTION) //如果该集合已经存在的话,则直接返回 for i, count := 0, len(self.DockerQueue.Dockers[dataIndex]); i < count; i++ { err = collection.Insert((interface{})(self.DockerQueue.Dockers[dataIndex][i])) if err != nil { panic(err) } } } /************************ HBase 输出 ***************************/ var master = cache.Task.Master var port = ":" + fmt.Sprintf("%v", cache.Task.Port) var hbaseSocket = teleport.New().SetPackHeader("tentinet") var hbaseOnce sync.Once Output["hbase"] = func(self *Collector, dataIndex int) { hbaseOnce.Do(func() { hbaseSocket.Client(master, port) }) for i, count := 0, len(self.DockerQueue.Dockers[dataIndex]); i < count; i++ { hbaseSocket.Request(self.DockerQueue.Dockers[dataIndex][i], "log") } } /************************ Mysql 输出 ***************************/ Output["mysql"] = func(self *Collector, dataIndex int) { db, err := sql.Open("mysql", config.MYSQL_USER+":"+config.MYSQL_PW+"@tcp("+config.MYSQL_HOST+")/"+config.MYSQL_DB+"?charset=utf8") if err != nil { fmt.Println(err) } defer db.Close() var newMysql = new(myTable) for Name, Rule := range self.GetRules() { //跳过不输出的数据 if len(Rule.GetOutFeild()) == 0 { continue } newMysql.setTableName("`" + self.Spider.GetName() + "-" + Name + "-" + self.Spider.GetKeyword() + "`") for _, title := range Rule.GetOutFeild() { newMysql.addColumn(title) } newMysql.addColumn("当前连接", "上级链接", "下载时间"). create(db) num := 0 //小计 for _, datacell := range self.DockerQueue.Dockers[dataIndex] { if datacell["RuleName"].(string) == Name { for _, title := range Rule.GetOutFeild() { vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { newMysql.addRow(v) } else { newMysql.addRow(util.JsonString(vd[title])) } } newMysql.addRow(datacell["Url"].(string), datacell["ParentUrl"].(string), datacell["DownloadTime"].(string)). update(db) num++ } } newMysql = new(myTable) } } }
/************************ excel 输出 ***************************/ func init() { Output["excel"] = func(self *Collector, dataIndex int) (err error) { defer func() { if p := recover(); p != nil { err = fmt.Errorf("%v", p) } }() var ( file *xlsx.File row *xlsx.Row cell *xlsx.Cell sheets = make(map[string]*xlsx.Sheet) ) // 创建文件 file = xlsx.NewFile() // 添加分类数据工作表 for _, datacell := range self.DockerQueue.Dockers[dataIndex] { var subNamespace = util.FileNameReplace(self.subNamespace(datacell)) if _, ok := sheets[subNamespace]; !ok { // 添加工作表 sheet, err := file.AddSheet(subNamespace) if err != nil { logs.Log.Error("%v", err) continue } sheets[subNamespace] = sheet // 写入表头 row = sheets[subNamespace].AddRow() for _, title := range self.MustGetRule(datacell["RuleName"].(string)).ItemFields { row.AddCell().Value = title } if self.Spider.OutDefaultField() { row.AddCell().Value = "当前链接" row.AddCell().Value = "上级链接" row.AddCell().Value = "下载时间" } } row = sheets[subNamespace].AddRow() for _, title := range self.MustGetRule(datacell["RuleName"].(string)).ItemFields { cell = row.AddCell() vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { cell.Value = v } else { cell.Value = util.JsonString(vd[title]) } } if self.Spider.OutDefaultField() { row.AddCell().Value = datacell["Url"].(string) row.AddCell().Value = datacell["ParentUrl"].(string) row.AddCell().Value = datacell["DownloadTime"].(string) } } folder := config.TEXT_DIR + "/" + cache.StartTime.Format("2006年01月02日 15时04分05秒") filename := fmt.Sprintf("%v/%v__%v-%v.xlsx", folder, util.FileNameReplace(self.namespace()), self.sum[0], self.sum[1]) // 创建/打开目录 f2, err := os.Stat(folder) if err != nil || !f2.IsDir() { if err := os.MkdirAll(folder, 0777); err != nil { logs.Log.Error("Error: %v\n", err) } } // 保存文件 err = file.Save(filename) return } }
/************************ Kafka 输出 ***************************/ func init() { var ( kafkaSenders = map[string]*kafka.KafkaSender{} kafkaSenderLock sync.RWMutex ) var getKafkaSender = func(name string) (*kafka.KafkaSender, bool) { kafkaSenderLock.RLock() tab, ok := kafkaSenders[name] kafkaSenderLock.RUnlock() return tab, ok } var setKafkaSender = func(name string, tab *kafka.KafkaSender) { kafkaSenderLock.Lock() kafkaSenders[name] = tab kafkaSenderLock.Unlock() } DataOutput["kafka"] = func(self *Collector) error { _, err := kafka.GetProducer() if err != nil { return fmt.Errorf("kafka producer失败: %v", err) } var ( kafkas = make(map[string]*kafka.KafkaSender) namespace = util.FileNameReplace(self.namespace()) ) for _, datacell := range self.dataDocker { subNamespace := util.FileNameReplace(self.subNamespace(datacell)) topicName := joinNamespaces(namespace, subNamespace) sender, ok := kafkas[topicName] if !ok { sender, ok = getKafkaSender(topicName) if ok { kafkas[topicName] = sender } else { sender = kafka.New() sender.SetTopic(topicName) setKafkaSender(topicName, sender) kafkas[topicName] = sender } } data := make(map[string]interface{}) for _, title := range self.MustGetRule(datacell["RuleName"].(string)).ItemFields { vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { data[title] = v } else { data[title] = util.JsonString(vd[title]) } } if self.Spider.OutDefaultField() { data["url"] = datacell["Url"].(string) data["parent_url"] = datacell["ParentUrl"].(string) data["download_time"] = datacell["DownloadTime"].(string) } err := sender.Push(data) util.CheckErr(err) } kafkas = nil return nil } }