func init() { Output["mgo"] = func(self *Collector, dataIndex int) { var err error //连接数据库 mgoSession := mgo.MgoPool.GetOne().(*mgo.MgoSrc) defer mgo.MgoPool.Free(mgoSession) var db = mgoSession.DB(config.MGO.DB) var namespace = util.FileNameReplace(self.namespace()) var collections = make(map[string]*mgov2.Collection) var dataMap = make(map[string][]interface{}) for _, datacell := range self.DockerQueue.Dockers[dataIndex] { subNamespace := util.FileNameReplace(self.subNamespace(datacell)) if _, ok := collections[subNamespace]; !ok { collections[subNamespace] = db.C(namespace + "__" + subNamespace) } for k, v := range datacell["Data"].(map[string]interface{}) { datacell[k] = v } delete(datacell, "Data") delete(datacell, "RuleName") dataMap[subNamespace] = append(dataMap[subNamespace], datacell) } for k, v := range dataMap { err = collections[k].Insert(v...) if err != nil { logs.Log.Error("%v", err) } } } }
func New(name string, subName string) Historier { successTabName := SUCCESS_SUFFIX + "__" + name successFileName := SUCCESS_FILE + "__" + name failureTabName := FAILURE_SUFFIX + "__" + name failureFileName := FAILURE_FILE + "__" + name if subName != "" { successTabName += "__" + subName successFileName += "__" + subName failureTabName += "__" + subName failureFileName += "__" + subName } return &History{ Success: &Success{ tabName: util.FileNameReplace(successTabName), fileName: successFileName, new: make(map[string]bool), old: make(map[string]bool), }, Failure: &Failure{ tabName: util.FileNameReplace(failureTabName), fileName: failureFileName, list: make(map[string]*request.Request), }, } }
func init() { Output["mgo"] = func(self *Collector, dataIndex int) error { //连接数据库 if mgo.Error() != nil { return fmt.Errorf("MongoBD数据库链接失败: %v", mgo.Error()) } return mgo.Call(func(src pool.Src) error { var ( db = src.(*mgo.MgoSrc).DB(config.DB_NAME) namespace = util.FileNameReplace(self.namespace()) collections = make(map[string]*mgov2.Collection) dataMap = make(map[string][]interface{}) err error ) for _, datacell := range self.DockerQueue.Dockers[dataIndex] { subNamespace := util.FileNameReplace(self.subNamespace(datacell)) var cName = namespace if subNamespace != "" { cName += "__" + subNamespace } if _, ok := collections[subNamespace]; !ok { collections[subNamespace] = db.C(cName) } for k, v := range datacell["Data"].(map[string]interface{}) { datacell[k] = v } delete(datacell, "Data") delete(datacell, "RuleName") if !self.Spider.OutDefaultField() { delete(datacell, "Url") delete(datacell, "ParentUrl") delete(datacell, "DownloadTime") } dataMap[subNamespace] = append(dataMap[subNamespace], datacell) } for collection, docs := range dataMap { c := collections[collection] count := len(docs) loop := count / mgo.MaxLen for i := 0; i < loop; i++ { err = c.Insert(docs[i*mgo.MaxLen : (i+1)*mgo.MaxLen]...) if err != nil { logs.Log.Error("%v", err) } } if count%mgo.MaxLen == 0 { continue } err = c.Insert(docs[loop*mgo.MaxLen:]...) if err != nil { logs.Log.Error("%v", err) } } return nil }) } }
// 文件输出 func (self *Collector) outputFile(file data.FileCell) { // 复用FileCell defer func() { data.PutFileCell(file) self.wait.Done() }() // 路径: file/"RuleName"/"time"/"Name" p, n := filepath.Split(filepath.Clean(file["Name"].(string))) // dir := filepath.Join(config.FILE_DIR, util.FileNameReplace(self.namespace())+"__"+cache.StartTime.Format("2006年01月02日 15时04分05秒"), p) dir := filepath.Join(config.FILE_DIR, util.FileNameReplace(self.namespace()), p) // 文件名 fileName := filepath.Join(dir, util.FileNameReplace(n)) // 创建/打开目录 d, err := os.Stat(dir) if err != nil || !d.IsDir() { if err := os.MkdirAll(dir, 0777); err != nil { logs.Log.Error( " * Fail [文件下载:%v | KEYIN:%v | 批次:%v] %v [ERROR] %v\n", self.Spider.GetName(), self.Spider.GetKeyin(), atomic.LoadUint64(&self.fileBatch), fileName, err, ) return } } // 文件不存在就以0777的权限创建文件,如果存在就在写入之前清空内容 f, err := os.OpenFile(fileName, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0777) if err != nil { logs.Log.Error( " * Fail [文件下载:%v | KEYIN:%v | 批次:%v] %v [ERROR] %v\n", self.Spider.GetName(), self.Spider.GetKeyin(), atomic.LoadUint64(&self.fileBatch), fileName, err, ) return } size, err := io.Copy(f, bytes.NewReader(file["Bytes"].([]byte))) f.Close() if err != nil { logs.Log.Error( " * Fail [文件下载:%v | KEYIN:%v | 批次:%v] %v (%s) [ERROR] %v\n", self.Spider.GetName(), self.Spider.GetKeyin(), atomic.LoadUint64(&self.fileBatch), fileName, bytesSize.Format(uint64(size)), err, ) return } // 输出统计 self.addFileSum(1) // 打印报告 logs.Log.Informational(" * ") logs.Log.App( " * [文件下载:%v | KEYIN:%v | 批次:%v] %v (%s)\n", self.Spider.GetName(), self.Spider.GetKeyin(), atomic.LoadUint64(&self.fileBatch), fileName, bytesSize.Format(uint64(size)), ) logs.Log.Informational(" * ") }
func init() { Output["mysql"] = func(self *Collector, dataIndex int) { db, ok := mysql.MysqlPool.GetOne().(*mysql.MysqlSrc) if !ok || db == nil { logs.Log.Error("链接Mysql数据库超时,无法输出!") return } defer mysql.MysqlPool.Free(db) var mysqls = make(map[string]*mysql.MyTable) var namespace = util.FileNameReplace(self.namespace()) for _, datacell := range self.DockerQueue.Dockers[dataIndex] { subNamespace := util.FileNameReplace(self.subNamespace(datacell)) var tName = namespace if subNamespace != "" { tName += "__" + subNamespace } if _, ok := mysqls[subNamespace]; !ok { mysqls[subNamespace] = mysql.New(db.DB) mysqls[subNamespace].SetTableName(tName) for _, title := range self.MustGetRule(datacell["RuleName"].(string)).ItemFields { mysqls[subNamespace].AddColumn(title + ` MEDIUMTEXT`) } mysqls[subNamespace]. AddColumn(`Url VARCHAR(255)`, `ParentUrl VARCHAR(255)`, `DownloadTime VARCHAR(50)`). Create() } for _, title := range self.MustGetRule(datacell["RuleName"].(string)).ItemFields { vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { mysqls[subNamespace].AddRow(v) } else { mysqls[subNamespace].AddRow(util.JsonString(vd[title])) } } err := mysqls[subNamespace]. AddRow(datacell["Url"].(string), datacell["ParentUrl"].(string), datacell["DownloadTime"].(string)). Update() util.CheckErr(err) } } }
//文件输出管理 func (self *Collector) SaveFile() { for !(self.CtrlLen() == 0 && len(self.FileChan) == 0) { select { case file := <-self.FileChan: self.outCount[2]++ // 路径: file/"RuleName"/"time"/"Name" p, n := filepath.Split(filepath.Clean(file["Name"].(string))) // dir := filepath.Join(config.FILE_DIR, util.FileNameReplace(self.namespace())+"__"+cache.StartTime.Format("2006年01月02日 15时04分05秒"), p) dir := filepath.Join(config.FILE_DIR, util.FileNameReplace(self.namespace()), p) // 创建/打开目录 d, err := os.Stat(dir) if err != nil || !d.IsDir() { if err := os.MkdirAll(dir, 0777); err != nil { logs.Log.Error("Error: %v\n", err) } } // 输出统计 self.addFileSum(1) // 文件不存在就以0777的权限创建文件,如果存在就在写入之前清空内容 fileName := filepath.Join(dir, util.FileNameReplace(n)) f, _ := os.OpenFile(fileName, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0777) size, _ := io.Copy(f, file["Body"].(io.ReadCloser)) f.Close() file["Body"].(io.ReadCloser).Close() // 打印报告 logs.Log.Informational(" * ") logs.Log.App(" * [任务:%v | KEYIN:%v] 成功下载文件: %v (%s)\n", self.Spider.GetName(), self.Spider.GetKeyin(), fileName, bytes.Format(uint64(size))) logs.Log.Informational(" * ") self.outCount[3]++ // 复用FileCell data.PutFileCell(file) default: runtime.Gosched() } } }
func init() { Output["mysql"] = func(self *Collector, dataIndex int) { db, ok := mysql.MysqlPool.GetOne().(*mysql.MysqlSrc) if !ok || db == nil { logs.Log.Error("链接Mysql数据库超时,无法输出!") return } defer mysql.MysqlPool.Free(db) var mysqls = make(map[string]*mysql.MyTable) var namespace = util.FileNameReplace(self.namespace()) for _, datacell := range self.DockerQueue.Dockers[dataIndex] { subNamespace := util.FileNameReplace(self.subNamespace(datacell)) if _, ok := mysqls[subNamespace]; !ok { mysqls[subNamespace] = mysql.New(db.DB) mysqls[subNamespace].SetTableName("`" + namespace + "__" + subNamespace + "`") for _, title := range self.GetRule(datacell["RuleName"].(string)).GetOutFeild() { mysqls[subNamespace].AddColumn(title) } mysqls[subNamespace]. AddColumn("Url", "ParentUrl", "DownloadTime"). Create() } for _, title := range self.GetRule(datacell["RuleName"].(string)).GetOutFeild() { vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { mysqls[subNamespace].AddRow(v) } else { mysqls[subNamespace].AddRow(util.JsonString(vd[title])) } } mysqls[subNamespace]. AddRow(datacell["Url"].(string), datacell["ParentUrl"].(string), datacell["DownloadTime"].(string)). Update() } } }
//文件输出管理 func (self *Collector) SaveFile() { for !(self.CtrlLen() == 0 && len(self.FileChan) == 0) { select { case file := <-self.FileChan: self.outCount[2]++ // 统计输出文件数 self.setFileSum(1) // 路径: file/"RuleName"/"time"/"Name" p, n := path.Split(file["Name"].(string)) dir := config.COMM_PATH.FILE + `/` + util.FileNameReplace(self.namespace()) + "__" + cache.StartTime.Format("2006年01月02日 15时04分05秒") + `/` + p // 创建/打开目录 d, err := os.Stat(dir) if err != nil || !d.IsDir() { if err := os.MkdirAll(dir, 0777); err != nil { logs.Log.Error("Error: %v\n", err) } } // 创建文件 fileName := dir + util.FileNameReplace(n) f, _ := os.Create(fileName) io.Copy(f, file["Body"].(io.ReadCloser)) f.Close() file["Body"].(io.ReadCloser).Close() // 打印报告 logs.Log.Informational(" * ") logs.Log.Notice(" * [任务:%v | 关键词:%v] 成功下载文件: %v \n", self.Spider.GetName(), self.Spider.GetKeyword(), fileName) logs.Log.Informational(" * ") self.outCount[3]++ default: runtime.Gosched() } } }
/************************ excel 输出 ***************************/ func init() { Output["excel"] = func(self *Collector, dataIndex int) { defer func() { if err := recover(); err != nil { Log.Println(err) } }() var file *xlsx.File var sheet *xlsx.Sheet var row *xlsx.Row var cell *xlsx.Cell var err error folder1 := "result/data" folder2 := folder1 + "/" + self.startTime.Format("2006年01月02日 15时04分05秒") filename := folder2 + "/" + util.FileNameReplace(self.Spider.GetName()+"_"+self.Spider.GetKeyword()+" "+fmt.Sprintf("%v", self.sum[0])+"-"+fmt.Sprintf("%v", self.sum[1])) + ".xlsx" // 创建文件 file = xlsx.NewFile() // 添加分类数据工作表 for Name, Rule := range self.GetRules() { // 跳过不输出的数据 if len(Rule.GetOutFeild()) == 0 { continue } // 添加工作表 sheet = file.AddSheet(util.ExcelSheetNameReplace(Name)) // 写入表头 row = sheet.AddRow() for _, title := range Rule.GetOutFeild() { cell = row.AddCell() cell.Value = title } cell = row.AddCell() cell.Value = "当前链接" cell = row.AddCell() cell.Value = "上级链接" cell = row.AddCell() cell.Value = "下载时间" num := 0 //小计 for _, datacell := range self.DockerQueue.Dockers[dataIndex] { if datacell["RuleName"].(string) == Name { row = sheet.AddRow() for _, title := range Rule.GetOutFeild() { cell = row.AddCell() vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { cell.Value = v } else { cell.Value = util.JsonString(vd[title]) } } cell = row.AddCell() cell.Value = datacell["Url"].(string) cell = row.AddCell() cell.Value = datacell["ParentUrl"].(string) cell = row.AddCell() cell.Value = datacell["DownloadTime"].(string) num++ } } // Log.Printf("[任务:%v | 关键词:%v | 小类:%v] 输出 %v 条数据!!!\n", self.Spider.GetName(), self.Spider.GetKeyword(), Name, num) } // 创建/打开目录 f2, err := os.Stat(folder2) if err != nil || !f2.IsDir() { if err := os.MkdirAll(folder2, 0777); err != nil { Log.Printf("Error: %v\n", err) } } // 保存文件 err = file.Save(filename) if err != nil { Log.Println(err) } } }
func init() { var ( mysqlTable = map[string]*mysql.MyTable{} mysqlTableLock sync.RWMutex ) var getMysqlTable = func(name string) (*mysql.MyTable, bool) { mysqlTableLock.RLock() defer mysqlTableLock.RUnlock() tab, ok := mysqlTable[name] if ok { return tab.Clone(), true } return nil, false } var setMysqlTable = func(name string, tab *mysql.MyTable) { mysqlTableLock.Lock() mysqlTable[name] = tab mysqlTableLock.Unlock() } DataOutput["mysql"] = func(self *Collector) error { _, err := mysql.DB() if err != nil { return fmt.Errorf("Mysql数据库链接失败: %v", err) } var ( mysqls = make(map[string]*mysql.MyTable) namespace = util.FileNameReplace(self.namespace()) ) for _, datacell := range self.dataDocker { subNamespace := util.FileNameReplace(self.subNamespace(datacell)) tName := joinNamespaces(namespace, subNamespace) table, ok := mysqls[tName] if !ok { table, ok = getMysqlTable(tName) if ok { mysqls[tName] = table } else { table = mysql.New() table.SetTableName(tName) for _, title := range self.MustGetRule(datacell["RuleName"].(string)).ItemFields { table.AddColumn(title + ` MEDIUMTEXT`) } if self.Spider.OutDefaultField() { table.AddColumn(`Url VARCHAR(255)`, `ParentUrl VARCHAR(255)`, `DownloadTime VARCHAR(50)`) } if err := table.Create(); err != nil { logs.Log.Error("%v", err) continue } else { setMysqlTable(tName, table) mysqls[tName] = table } } } data := []string{} for _, title := range self.MustGetRule(datacell["RuleName"].(string)).ItemFields { vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { data = append(data, v) } else { data = append(data, util.JsonString(vd[title])) } } if self.Spider.OutDefaultField() { data = append(data, datacell["Url"].(string), datacell["ParentUrl"].(string), datacell["DownloadTime"].(string)) } table.AutoInsert(data) } for _, tab := range mysqls { util.CheckErr(tab.FlushInsert()) } mysqls = nil return nil } }
/************************ CSV 输出 ***************************/ func init() { Output["csv"] = func(self *Collector, dataIndex int) { defer func() { if err := recover(); err != nil { logs.Log.Error("%v", err) } }() var namespace = util.FileNameReplace(self.namespace()) var sheets = make(map[string]*csv.Writer) for _, datacell := range self.DockerQueue.Dockers[dataIndex] { var subNamespace = util.FileNameReplace(self.subNamespace(datacell)) if _, ok := sheets[subNamespace]; !ok { folder := config.COMM_PATH.TEXT + "/" + cache.StartTime.Format("2006年01月02日 15时04分05秒") + "/" + namespace + "__" + subNamespace filename := fmt.Sprintf("%v/%v-%v.csv", folder, self.sum[0], self.sum[1]) // 创建/打开目录 f, err := os.Stat(folder) if err != nil || !f.IsDir() { if err := os.MkdirAll(folder, 0777); err != nil { logs.Log.Error("Error: %v\n", err) } } // 按数据分类创建文件 file, err := os.Create(filename) if err != nil { logs.Log.Error("%v", err) continue } file.WriteString("\xEF\xBB\xBF") // 写入UTF-8 BOM sheets[subNamespace] = csv.NewWriter(file) th := self.MustGetRule(datacell["RuleName"].(string)).ItemFields th = append(th, "当前链接", "上级链接", "下载时间") sheets[subNamespace].Write(th) defer func(file *os.File) { // 发送缓存数据流 sheets[subNamespace].Flush() // 关闭文件 file.Close() }(file) } row := []string{} for _, title := range self.MustGetRule(datacell["RuleName"].(string)).ItemFields { vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { row = append(row, v) } else { row = append(row, util.JsonString(vd[title])) } } row = append(row, datacell["Url"].(string)) row = append(row, datacell["ParentUrl"].(string)) row = append(row, datacell["DownloadTime"].(string)) sheets[subNamespace].Write(row) } } }
/************************ CSV 输出 ***************************/ func init() { Output["csv"] = func(self *Collector, dataIndex int) { defer func() { if err := recover(); err != nil { Log.Println(err) } }() folder1 := "result/data" folder2 := folder1 + "/" + self.startTime.Format("2006年01月02日 15时04分05秒") filenameBase := folder2 + "/" + util.FileNameReplace(self.Spider.GetName()+"_"+self.Spider.GetKeyword()+" "+fmt.Sprintf("%v", self.sum[0])+"-"+fmt.Sprintf("%v", self.sum[1])) // 创建/打开目录 f2, err := os.Stat(folder2) if err != nil || !f2.IsDir() { if err := os.MkdirAll(folder2, 0777); err != nil { Log.Printf("Error: %v\n", err) } } // 按数据分类创建文件 for Name, Rule := range self.GetRules() { // 跳过不输出的数据 if len(Rule.GetOutFeild()) == 0 { continue } file, err := os.Create(filenameBase + " (" + util.FileNameReplace(Name) + ").csv") if err != nil { Log.Println(err) continue } file.WriteString("\xEF\xBB\xBF") // 写入UTF-8 BOM w := csv.NewWriter(file) th := Rule.GetOutFeild() th = append(th, []string{"当前链接", "上级链接", "下载时间"}...) w.Write(th) num := 0 //小计 for _, datacell := range self.DockerQueue.Dockers[dataIndex] { if datacell["RuleName"].(string) == Name { row := []string{} for _, title := range Rule.GetOutFeild() { vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { row = append(row, v) } else { row = append(row, util.JsonString(vd[title])) } } row = append(row, datacell["Url"].(string)) row = append(row, datacell["ParentUrl"].(string)) row = append(row, datacell["DownloadTime"].(string)) w.Write(row) num++ } } // 发送缓存数据流 w.Flush() // 关闭文件 file.Close() // 输出报告 // Log.Printf("[任务:%v | 关键词:%v | 小类:%v] 输出 %v 条数据!!!\n", self.Spider.GetName(), self.Spider.GetKeyword(), Name, num) } } }
func init() { defer func() { // 获取输出方式列表 for out, _ := range Output { OutputLib = append(OutputLib, out) } util.StringsSort(OutputLib) }() /************************ excel 输出 ***************************/ Output["excel"] = func(self *Collector, dataIndex int) { defer func() { if err := recover(); err != nil { Log.Println(err) } }() var file *xlsx.File var sheet *xlsx.Sheet var row *xlsx.Row var cell *xlsx.Cell var err error folder1 := "result/data" folder2 := folder1 + "/" + self.startTime.Format("2006年01月02日 15时04分05秒") filename := folder2 + "/" + util.FileNameReplace(self.Spider.GetName()+"_"+self.Spider.GetKeyword()+" "+fmt.Sprintf("%v", self.sum[0])+"-"+fmt.Sprintf("%v", self.sum[1])) + ".xlsx" // 创建文件 file = xlsx.NewFile() // 添加分类数据工作表 for Name, Rule := range self.GetRules() { // 跳过不输出的数据 if len(Rule.GetOutFeild()) == 0 { continue } // 添加工作表 sheet = file.AddSheet(util.ExcelSheetNameReplace(Name)) // 写入表头 row = sheet.AddRow() for _, title := range Rule.GetOutFeild() { cell = row.AddCell() cell.Value = title } cell = row.AddCell() cell.Value = "当前链接" cell = row.AddCell() cell.Value = "上级链接" cell = row.AddCell() cell.Value = "下载时间" num := 0 //小计 for _, datacell := range self.DockerQueue.Dockers[dataIndex] { if datacell["RuleName"].(string) == Name { row = sheet.AddRow() for _, title := range Rule.GetOutFeild() { cell = row.AddCell() vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { cell.Value = v } else { cell.Value = util.JsonString(vd[title]) } } cell = row.AddCell() cell.Value = datacell["Url"].(string) cell = row.AddCell() cell.Value = datacell["ParentUrl"].(string) cell = row.AddCell() cell.Value = datacell["DownloadTime"].(string) num++ } } // Log.Printf("[任务:%v | 关键词:%v | 小类:%v] 输出 %v 条数据!!!\n", self.Spider.GetName(), self.Spider.GetKeyword(), Name, num) } // 创建/打开目录 f2, err := os.Stat(folder2) if err != nil || !f2.IsDir() { if err := os.MkdirAll(folder2, 0777); err != nil { Log.Printf("Error: %v\n", err) } } // 保存文件 err = file.Save(filename) if err != nil { Log.Println(err) } } /************************ CSV 输出 ***************************/ Output["csv"] = func(self *Collector, dataIndex int) { defer func() { if err := recover(); err != nil { Log.Println(err) } }() folder1 := "result/data" folder2 := folder1 + "/" + self.startTime.Format("2006年01月02日 15时04分05秒") filenameBase := folder2 + "/" + util.FileNameReplace(self.Spider.GetName()+"_"+self.Spider.GetKeyword()+" "+fmt.Sprintf("%v", self.sum[0])+"-"+fmt.Sprintf("%v", self.sum[1])) // 创建/打开目录 f2, err := os.Stat(folder2) if err != nil || !f2.IsDir() { if err := os.MkdirAll(folder2, 0777); err != nil { Log.Printf("Error: %v\n", err) } } // 按数据分类创建文件 for Name, Rule := range self.GetRules() { // 跳过不输出的数据 if len(Rule.GetOutFeild()) == 0 { continue } file, err := os.Create(filenameBase + " (" + util.FileNameReplace(Name) + ").csv") if err != nil { Log.Println(err) continue } file.WriteString("\xEF\xBB\xBF") // 写入UTF-8 BOM w := csv.NewWriter(file) th := Rule.GetOutFeild() th = append(th, []string{"当前链接", "上级链接", "下载时间"}...) w.Write(th) num := 0 //小计 for _, datacell := range self.DockerQueue.Dockers[dataIndex] { if datacell["RuleName"].(string) == Name { row := []string{} for _, title := range Rule.GetOutFeild() { vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { row = append(row, v) } else { row = append(row, util.JsonString(vd[title])) } } row = append(row, datacell["Url"].(string)) row = append(row, datacell["ParentUrl"].(string)) row = append(row, datacell["DownloadTime"].(string)) w.Write(row) num++ } } // 发送缓存数据流 w.Flush() // 关闭文件 file.Close() // 输出报告 // Log.Printf("[任务:%v | 关键词:%v | 小类:%v] 输出 %v 条数据!!!\n", self.Spider.GetName(), self.Spider.GetKeyword(), Name, num) } } /************************ MongoDB 输出 ***************************/ Output["mgo"] = func(self *Collector, dataIndex int) { session, err := mgo.Dial(config.DB_URL) //连接数据库 if err != nil { panic(err) } defer session.Close() session.SetMode(mgo.Monotonic, true) db := session.DB(config.DB_NAME) //数据库名称 collection := db.C(config.DB_COLLECTION) //如果该集合已经存在的话,则直接返回 for i, count := 0, len(self.DockerQueue.Dockers[dataIndex]); i < count; i++ { err = collection.Insert((interface{})(self.DockerQueue.Dockers[dataIndex][i])) if err != nil { panic(err) } } } /************************ HBase 输出 ***************************/ var master = cache.Task.Master var port = ":" + fmt.Sprintf("%v", cache.Task.Port) var hbaseSocket = teleport.New().SetPackHeader("tentinet") var hbaseOnce sync.Once Output["hbase"] = func(self *Collector, dataIndex int) { hbaseOnce.Do(func() { hbaseSocket.Client(master, port) }) for i, count := 0, len(self.DockerQueue.Dockers[dataIndex]); i < count; i++ { hbaseSocket.Request(self.DockerQueue.Dockers[dataIndex][i], "log") } } /************************ Mysql 输出 ***************************/ Output["mysql"] = func(self *Collector, dataIndex int) { db, err := sql.Open("mysql", config.MYSQL_USER+":"+config.MYSQL_PW+"@tcp("+config.MYSQL_HOST+")/"+config.MYSQL_DB+"?charset=utf8") if err != nil { fmt.Println(err) } defer db.Close() var newMysql = new(myTable) for Name, Rule := range self.GetRules() { //跳过不输出的数据 if len(Rule.GetOutFeild()) == 0 { continue } newMysql.setTableName("`" + self.Spider.GetName() + "-" + Name + "-" + self.Spider.GetKeyword() + "`") for _, title := range Rule.GetOutFeild() { newMysql.addColumn(title) } newMysql.addColumn("当前连接", "上级链接", "下载时间"). create(db) num := 0 //小计 for _, datacell := range self.DockerQueue.Dockers[dataIndex] { if datacell["RuleName"].(string) == Name { for _, title := range Rule.GetOutFeild() { vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { newMysql.addRow(v) } else { newMysql.addRow(util.JsonString(vd[title])) } } newMysql.addRow(datacell["Url"].(string), datacell["ParentUrl"].(string), datacell["DownloadTime"].(string)). update(db) num++ } } newMysql = new(myTable) } } }
/************************ excel 输出 ***************************/ func init() { Output["excel"] = func(self *Collector, dataIndex int) (err error) { defer func() { if p := recover(); p != nil { err = fmt.Errorf("%v", p) } }() var ( file *xlsx.File row *xlsx.Row cell *xlsx.Cell sheets = make(map[string]*xlsx.Sheet) ) // 创建文件 file = xlsx.NewFile() // 添加分类数据工作表 for _, datacell := range self.DockerQueue.Dockers[dataIndex] { var subNamespace = util.FileNameReplace(self.subNamespace(datacell)) if _, ok := sheets[subNamespace]; !ok { // 添加工作表 sheet, err := file.AddSheet(subNamespace) if err != nil { logs.Log.Error("%v", err) continue } sheets[subNamespace] = sheet // 写入表头 row = sheets[subNamespace].AddRow() for _, title := range self.MustGetRule(datacell["RuleName"].(string)).ItemFields { row.AddCell().Value = title } if self.Spider.OutDefaultField() { row.AddCell().Value = "当前链接" row.AddCell().Value = "上级链接" row.AddCell().Value = "下载时间" } } row = sheets[subNamespace].AddRow() for _, title := range self.MustGetRule(datacell["RuleName"].(string)).ItemFields { cell = row.AddCell() vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { cell.Value = v } else { cell.Value = util.JsonString(vd[title]) } } if self.Spider.OutDefaultField() { row.AddCell().Value = datacell["Url"].(string) row.AddCell().Value = datacell["ParentUrl"].(string) row.AddCell().Value = datacell["DownloadTime"].(string) } } folder := config.TEXT_DIR + "/" + cache.StartTime.Format("2006年01月02日 15时04分05秒") filename := fmt.Sprintf("%v/%v__%v-%v.xlsx", folder, util.FileNameReplace(self.namespace()), self.sum[0], self.sum[1]) // 创建/打开目录 f2, err := os.Stat(folder) if err != nil || !f2.IsDir() { if err := os.MkdirAll(folder, 0777); err != nil { logs.Log.Error("Error: %v\n", err) } } // 保存文件 err = file.Save(filename) return } }
/************************ Kafka 输出 ***************************/ func init() { var ( kafkaSenders = map[string]*kafka.KafkaSender{} kafkaSenderLock sync.RWMutex ) var getKafkaSender = func(name string) (*kafka.KafkaSender, bool) { kafkaSenderLock.RLock() tab, ok := kafkaSenders[name] kafkaSenderLock.RUnlock() return tab, ok } var setKafkaSender = func(name string, tab *kafka.KafkaSender) { kafkaSenderLock.Lock() kafkaSenders[name] = tab kafkaSenderLock.Unlock() } DataOutput["kafka"] = func(self *Collector) error { _, err := kafka.GetProducer() if err != nil { return fmt.Errorf("kafka producer失败: %v", err) } var ( kafkas = make(map[string]*kafka.KafkaSender) namespace = util.FileNameReplace(self.namespace()) ) for _, datacell := range self.dataDocker { subNamespace := util.FileNameReplace(self.subNamespace(datacell)) topicName := joinNamespaces(namespace, subNamespace) sender, ok := kafkas[topicName] if !ok { sender, ok = getKafkaSender(topicName) if ok { kafkas[topicName] = sender } else { sender = kafka.New() sender.SetTopic(topicName) setKafkaSender(topicName, sender) kafkas[topicName] = sender } } data := make(map[string]interface{}) for _, title := range self.MustGetRule(datacell["RuleName"].(string)).ItemFields { vd := datacell["Data"].(map[string]interface{}) if v, ok := vd[title].(string); ok || vd[title] == nil { data[title] = v } else { data[title] = util.JsonString(vd[title]) } } if self.Spider.OutDefaultField() { data["url"] = datacell["Url"].(string) data["parent_url"] = datacell["ParentUrl"].(string) data["download_time"] = datacell["DownloadTime"].(string) } err := sender.Push(data) util.CheckErr(err) } kafkas = nil return nil } }