Example #1
0
// flush writes the pending success records held in self.new to the storage
// selected by provider and then moves them into self.old.
//
// provider "mgo" inserts one document per record, "mysql" adds one row per
// record to the SUCCESS_FILE table, and any other value appends the records
// to the local file SUCCESS_FILE_FULL.
//
// It returns the number of records that were pending when it was called. It
// returns 0 when there was nothing to flush, when no MySQL connection could
// be obtained, or when the local file could not be written; in those failure
// cases self.new is left untouched so the records are not dropped.
func (self *Success) flush(provider string) (sLen int) {
	self.RWMutex.Lock()
	defer self.RWMutex.Unlock()

	sLen = len(self.new)
	if sLen == 0 {
		return
	}

	switch provider {
	case "mgo":
		var docs = make([]map[string]interface{}, sLen)
		var i int
		for key := range self.new {
			docs[i] = map[string]interface{}{"_id": key}
			self.old[key] = true
			i++
		}
		// NOTE(review): the result of mgo.Mgo is not checked here.
		mgo.Mgo(nil, "insert", map[string]interface{}{
			"Database":   MGO_DB,
			"Collection": SUCCESS_FILE,
			"Docs":       docs,
		})

	case "mysql":
		db, ok := mysql.MysqlPool.GetOne().(*mysql.MysqlSrc)
		if !ok || db == nil {
			logs.Log.Error("链接Mysql数据库超时,无法保存去重记录!")
			return 0
		}
		defer mysql.MysqlPool.Free(db)
		table := mysql.New(db.DB).
			SetTableName(SUCCESS_FILE).
			CustomPrimaryKey(`id VARCHAR(255) not null primary key`).
			Create()
		for key := range self.new {
			table.AddRow(key).Update()
			self.old[key] = true
		}

	default:
		once.Do(mkdir)
		f, err := os.OpenFile(SUCCESS_FILE_FULL, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0660)
		if err != nil {
			logs.Log.Error("打开成功记录文件失败: %v", err)
			return 0
		}
		defer f.Close()

		b, err := json.Marshal(self.new)
		if err != nil {
			logs.Log.Error("序列化成功记录失败: %v", err)
			return 0
		}
		// Replace the opening '{' with ',' and drop the closing '}' so that
		// every append adds more `,"key":true` members to the same file.
		b[0] = ','
		if _, err = f.Write(b[:len(b)-1]); err != nil {
			logs.Log.Error("写入成功记录文件失败: %v", err)
			return 0
		}

		// Only mark the records as saved once they are on disk.
		for key := range self.new {
			self.old[key] = true
		}
	}
	self.new = make(map[string]bool)
	return
}
Example #2
0
// init registers the MySQL writer in the Output registry under "mysql".
//
// The registered function takes a pooled MySQL connection and writes every
// data cell of self.DockerQueue.Dockers[dataIndex] as one row. One table is
// prepared per sub-namespace; its columns are the rule's ItemFields (as
// MEDIUMTEXT) followed by Url, ParentUrl and DownloadTime.
func init() {
	Output["mysql"] = func(self *Collector, dataIndex int) {
		conn, good := mysql.MysqlPool.GetOne().(*mysql.MysqlSrc)
		if conn == nil || !good {
			logs.Log.Error("链接Mysql数据库超时,无法输出!")
			return
		}
		defer mysql.MysqlPool.Free(conn)

		tables := make(map[string]*mysql.MyTable)
		prefix := util.FileNameReplace(self.namespace())

		for _, cell := range self.DockerQueue.Dockers[dataIndex] {
			sub := util.FileNameReplace(self.subNamespace(cell))

			// Table name is the namespace, optionally suffixed with the
			// sub-namespace.
			tableName := prefix
			if sub != "" {
				tableName = prefix + "__" + sub
			}

			tbl, known := tables[sub]
			if !known {
				// First cell for this sub-namespace: describe and create the table.
				tbl = mysql.New(conn.DB)
				tables[sub] = tbl
				tbl.SetTableName(tableName)
				for _, field := range self.MustGetRule(cell["RuleName"].(string)).ItemFields {
					tbl.AddColumn(field + ` MEDIUMTEXT`)
				}
				tbl.AddColumn(`Url VARCHAR(255)`, `ParentUrl VARCHAR(255)`, `DownloadTime VARCHAR(50)`).Create()
			}

			// Strings (and missing values) are stored as they are; anything
			// else is stored as its JSON text.
			for _, field := range self.MustGetRule(cell["RuleName"].(string)).ItemFields {
				payload := cell["Data"].(map[string]interface{})
				raw := payload[field]
				if text, isText := raw.(string); !isText && raw != nil {
					tbl.AddRow(util.JsonString(raw))
				} else {
					tbl.AddRow(text)
				}
			}

			util.CheckErr(
				tbl.AddRow(cell["Url"].(string), cell["ParentUrl"].(string), cell["DownloadTime"].(string)).
					Update(),
			)
		}
	}
}
Example #3
0
// init registers the MySQL writer in the Output registry under "mysql".
//
// The registered function takes a pooled MySQL connection and writes every
// data cell of self.DockerQueue.Dockers[dataIndex] as one row. One table,
// named `<namespace>__<subNamespace>`, is prepared per sub-namespace; its
// columns are the rule's output fields followed by Url, ParentUrl and
// DownloadTime.
func init() {
	Output["mysql"] = func(self *Collector, dataIndex int) {
		conn, good := mysql.MysqlPool.GetOne().(*mysql.MysqlSrc)
		if conn == nil || !good {
			logs.Log.Error("链接Mysql数据库超时,无法输出!")
			return
		}
		defer mysql.MysqlPool.Free(conn)

		tables := make(map[string]*mysql.MyTable)
		prefix := util.FileNameReplace(self.namespace())

		for _, cell := range self.DockerQueue.Dockers[dataIndex] {
			sub := util.FileNameReplace(self.subNamespace(cell))

			tbl, known := tables[sub]
			if !known {
				// First cell for this sub-namespace: describe and create the table.
				tbl = mysql.New(conn.DB)
				tables[sub] = tbl
				tbl.SetTableName("`" + prefix + "__" + sub + "`")
				for _, field := range self.GetRule(cell["RuleName"].(string)).GetOutFeild() {
					tbl.AddColumn(field)
				}
				tbl.AddColumn("Url", "ParentUrl", "DownloadTime").Create()
			}

			// Strings (and missing values) are stored as they are; anything
			// else is stored as its JSON text.
			for _, field := range self.GetRule(cell["RuleName"].(string)).GetOutFeild() {
				payload := cell["Data"].(map[string]interface{})
				raw := payload[field]
				if text, isText := raw.(string); !isText && raw != nil {
					tbl.AddRow(util.JsonString(raw))
				} else {
					tbl.AddRow(text)
				}
			}

			tbl.AddRow(cell["Url"].(string), cell["ParentUrl"].(string), cell["DownloadTime"].(string)).
				Update()
		}
	}
}
Example #4
0
// init registers the MySQL writer in the DataOutput registry under "mysql".
//
// Table descriptions that were created successfully are remembered in a map
// guarded by a read/write mutex; later calls receive a clone of the
// remembered description instead of creating the table again.
func init() {
	var (
		cacheMu sync.RWMutex
		cache   = map[string]*mysql.MyTable{}
	)

	// lookup returns a clone of the remembered table for name, if any.
	lookup := func(name string) (*mysql.MyTable, bool) {
		cacheMu.RLock()
		defer cacheMu.RUnlock()
		if cached, found := cache[name]; found {
			return cached.Clone(), true
		}
		return nil, false
	}

	// store remembers tab under name.
	store := func(name string, tab *mysql.MyTable) {
		cacheMu.Lock()
		defer cacheMu.Unlock()
		cache[name] = tab
	}

	// The writer buffers one row per data cell of self.dataDocker and flushes
	// every touched table at the end. It only returns an error when the
	// database handle cannot be obtained.
	DataOutput["mysql"] = func(self *Collector) error {
		if _, err := mysql.DB(); err != nil {
			return fmt.Errorf("Mysql数据库链接失败: %v", err)
		}

		batch := make(map[string]*mysql.MyTable)
		prefix := util.FileNameReplace(self.namespace())

		for _, cell := range self.dataDocker {
			name := joinNamespaces(prefix, util.FileNameReplace(self.subNamespace(cell)))

			tbl, ready := batch[name]
			if !ready {
				if tbl, ready = lookup(name); !ready {
					// Not seen before: describe the table and create it.
					tbl = mysql.New()
					tbl.SetTableName(name)
					for _, field := range self.MustGetRule(cell["RuleName"].(string)).ItemFields {
						tbl.AddColumn(field + ` MEDIUMTEXT`)
					}
					if self.Spider.OutDefaultField() {
						tbl.AddColumn(`Url VARCHAR(255)`, `ParentUrl VARCHAR(255)`, `DownloadTime VARCHAR(50)`)
					}
					if err := tbl.Create(); err != nil {
						// Skip this cell; the table is not remembered.
						logs.Log.Error("%v", err)
						continue
					}
					store(name, tbl)
				}
				batch[name] = tbl
			}

			// Strings (and missing values) are stored as they are; anything
			// else is stored as its JSON text.
			row := make([]string, 0)
			for _, field := range self.MustGetRule(cell["RuleName"].(string)).ItemFields {
				payload := cell["Data"].(map[string]interface{})
				raw := payload[field]
				if text, isText := raw.(string); isText || raw == nil {
					row = append(row, text)
				} else {
					row = append(row, util.JsonString(raw))
				}
			}
			if self.Spider.OutDefaultField() {
				row = append(row, cell["Url"].(string), cell["ParentUrl"].(string), cell["DownloadTime"].(string))
			}
			tbl.AutoInsert(row)
		}

		for _, tbl := range batch {
			util.CheckErr(tbl.FlushInsert())
		}
		return nil
	}
}
Example #5
0
// flush clears the previously persisted failure records and then writes the
// current content of self.list to the storage selected by provider ("mgo",
// "mysql", or any other value for the local file self.fileName).
//
// fLen is the number of records in self.list; in the mysql branch it is
// reduced by one for every row whose insert fails. err is non-nil when the
// storage is unavailable, when the MySQL table cannot be created or
// truncated, or when the local file cannot be written.
func (self *Failure) flush(provider string) (fLen int, err error) {
	self.RWMutex.Lock()
	defer self.RWMutex.Unlock()
	fLen = len(self.list)

	switch provider {
	case "mgo":
		if mgo.Error() != nil {
			return fLen, fmt.Errorf(" *     Fail  [添加失败记录][mgo]: %v 条 [ERROR]  %v\n", fLen, mgo.Error())
		}
		// NOTE(review): the results of DropCollection, Insert and mgo.Call
		// are not checked.
		mgo.Call(func(src pool.Src) error {
			c := src.(*mgo.MgoSrc).DB(config.DB_NAME).C(self.tabName)
			// Drop the old failure records.
			c.DropCollection()
			if fLen == 0 {
				return nil
			}

			var docs = []interface{}{}
			for key, req := range self.list {
				docs = append(docs, map[string]interface{}{"_id": key, "failure": req.Serialize()})
			}
			c.Insert(docs...)
			return nil
		})

	case "mysql":
		if _, dbErr := mysql.DB(); dbErr != nil {
			return fLen, fmt.Errorf(" *     Fail  [添加失败记录][mysql]: %v 条 [PING]  %v\n", fLen, dbErr)
		}
		table, ok := getWriteMysqlTable(self.tabName)
		if !ok {
			// Create the failure-record table.
			table = mysql.New()
			table.SetTableName(self.tabName).CustomPrimaryKey(`id VARCHAR(8) PRIMARY KEY`).AddColumn(`failure MEDIUMTEXT`)
			if createErr := table.Create(); createErr != nil {
				return fLen, fmt.Errorf(" *     Fail  [添加失败记录][mysql]: %v 条 [CREATE]  %v\n", fLen, createErr)
			}
			// Remember the table only after it was created, so a failed
			// creation is retried on the next flush instead of being
			// followed by a TRUNCATE.
			setWriteMysqlTable(self.tabName, table)
		} else {
			// Empty the failure-record table.
			if truncErr := table.Truncate(); truncErr != nil {
				return fLen, fmt.Errorf(" *     Fail  [添加失败记录][mysql]: %v 条 [TRUNCATE]  %v\n", fLen, truncErr)
			}
		}

		// Insert row by row; a failed row only lowers the reported count.
		for key, req := range self.list {
			table.AutoInsert([]string{key, req.Serialize()})
			if insertErr := table.FlushInsert(); insertErr != nil {
				fLen--
			}
		}

	default:
		// Remove the old failure-record file; the result is ignored on
		// purpose because the file may simply not exist yet.
		os.Remove(self.fileName)
		if fLen == 0 {
			return
		}

		docs := make(map[string]string, len(self.list))
		for key, req := range self.list {
			docs[key] = req.Serialize()
		}
		b, marshalErr := json.Marshal(docs)
		if marshalErr != nil {
			return fLen, fmt.Errorf(" *     Fail  [添加失败记录][file]: %v 条 [ERROR]  %v\n", fLen, marshalErr)
		}
		// json.Marshal escapes '&' as \u0026; store it in plain form.
		b = bytes.Replace(b, []byte(`\u0026`), []byte(`&`), -1)

		f, openErr := os.OpenFile(self.fileName, os.O_CREATE|os.O_WRONLY, 0777)
		if openErr != nil {
			return fLen, fmt.Errorf(" *     Fail  [添加失败记录][file]: %v 条 [ERROR]  %v\n", fLen, openErr)
		}
		defer f.Close()
		if _, writeErr := f.Write(b); writeErr != nil {
			return fLen, fmt.Errorf(" *     Fail  [添加失败记录][file]: %v 条 [ERROR]  %v\n", fLen, writeErr)
		}
	}
	return
}
Example #6
0
// ReadSuccess loads the persisted success records into self.Success.old.
//
// provider selects the storage: "mgo", "mysql", or any other value for the
// local file self.Success.fileName. inherit controls whether history is used:
//   - inherit == false: both record sets are emptied and nothing is read;
//   - inherit == true and the previous call also inherited: the in-memory
//     records are kept and nothing is read;
//   - inherit == true after a non-inheriting call: both record sets are
//     emptied and self.Success.old is refilled from storage.
//
// A missing file or a failed SELECT ends the call silently; other failures
// are logged.
func (self *History) ReadSuccess(provider string, inherit bool) {
	self.RWMutex.Lock()
	self.provider = provider
	self.RWMutex.Unlock()

	if !inherit {
		// History is not inherited.
		self.Success.old = make(map[string]bool)
		self.Success.new = make(map[string]bool)
		self.Success.inheritable = false
		return
	}
	if self.Success.inheritable {
		// Both this run and the previous one inherit history.
		return
	}
	// The previous run did not inherit history, this one does.
	self.Success.old = make(map[string]bool)
	self.Success.new = make(map[string]bool)
	self.Success.inheritable = true

	switch provider {
	case "mgo":
		var docs = map[string]interface{}{}
		err := mgo.Mgo(&docs, "find", map[string]interface{}{
			"Database":   config.DB_NAME,
			"Collection": self.Success.tabName,
		})
		if err != nil {
			logs.Log.Error(" *     Fail  [读取成功记录][mgo]: %v\n", err)
			return
		}
		// Checked assertions: a missing "Docs" entry or a document without
		// a string _id is skipped instead of causing a panic.
		list, _ := docs["Docs"].([]interface{})
		for _, v := range list {
			doc, ok := v.(bson.M)
			if !ok {
				continue
			}
			if id, ok := doc["_id"].(string); ok {
				self.Success.old[id] = true
			}
		}

	case "mysql":
		_, err := mysql.DB()
		if err != nil {
			logs.Log.Error(" *     Fail  [读取成功记录][mysql]: %v\n", err)
			return
		}
		table, ok := getReadMysqlTable(self.Success.tabName)
		if !ok {
			table = mysql.New().SetTableName(self.Success.tabName)
			setReadMysqlTable(self.Success.tabName, table)
		}
		rows, err := table.SelectAll()
		if err != nil {
			return
		}
		// Release the result set on every exit path.
		defer rows.Close()

		for rows.Next() {
			var id string
			if err = rows.Scan(&id); err != nil {
				// Do not record an empty key for a row that failed to scan.
				logs.Log.Error(" *     Fail  [读取成功记录][mysql]: %v\n", err)
				continue
			}
			self.Success.old[id] = true
		}

	default:
		f, err := os.Open(self.Success.fileName)
		if err != nil {
			return
		}
		defer f.Close()
		b, err := ioutil.ReadAll(f)
		if err != nil {
			logs.Log.Error(" *     Fail  [读取成功记录][file]: %v\n", err)
			return
		}
		if len(b) == 0 {
			return
		}
		// The file holds `,"k":true,"k2":true…`; swap the leading comma for
		// '{' and append '}' to obtain a JSON object.
		b[0] = '{'
		if err = json.Unmarshal(append(b, '}'), &self.Success.old); err != nil {
			logs.Log.Error(" *     Fail  [读取成功记录][file]: %v\n", err)
		}
	}
	logs.Log.Informational(" *     [读取成功记录]: %v 条\n", len(self.Success.old))
}
Example #7
0
// ReadFailure loads the persisted failure records into self.Failure.list.
//
// provider selects the storage: "mgo", "mysql", or any other value for the
// local file self.Failure.fileName. inherit controls whether history is used:
//   - inherit == false: the list is emptied and nothing is read;
//   - inherit == true and the previous call also inherited: the in-memory
//     list is kept and nothing is read;
//   - inherit == true after a non-inheriting call: the list is emptied and
//     refilled from storage.
//
// Records that cannot be deserialized are skipped. A missing file or a failed
// SELECT ends the call silently; other failures are logged.
func (self *History) ReadFailure(provider string, inherit bool) {
	self.RWMutex.Lock()
	self.provider = provider
	self.RWMutex.Unlock()

	if !inherit {
		// History is not inherited.
		self.Failure.list = make(map[*request.Request]bool)
		self.Failure.inheritable = false
		return
	}
	if self.Failure.inheritable {
		// Both this run and the previous one inherit history.
		return
	}
	// The previous run did not inherit history, this one does.
	self.Failure.list = make(map[*request.Request]bool)
	self.Failure.inheritable = true

	// fLen is the record count reported in the final log line.
	var fLen int
	switch provider {
	case "mgo":
		if mgo.Error() != nil {
			logs.Log.Error(" *     Fail  [读取失败记录][mgo]: %v\n", mgo.Error())
			return
		}

		var docs = []interface{}{}
		mgo.Call(func(src pool.Src) error {
			c := src.(*mgo.MgoSrc).DB(config.DB_NAME).C(self.Failure.tabName)
			return c.Find(nil).All(&docs)
		})

		fLen = len(docs)

		for _, v := range docs {
			// Checked assertions: a document without a string _id is
			// skipped instead of causing a panic.
			doc, ok := v.(bson.M)
			if !ok {
				continue
			}
			failure, ok := doc["_id"].(string)
			if !ok {
				continue
			}
			req, err := request.UnSerialize(failure)
			if err != nil {
				continue
			}
			self.Failure.list[req] = true
		}

	case "mysql":
		_, err := mysql.DB()
		if err != nil {
			logs.Log.Error(" *     Fail  [读取失败记录][mysql]: %v\n", err)
			return
		}
		table, ok := getReadMysqlTable(self.Failure.tabName)
		if !ok {
			table = mysql.New().SetTableName(self.Failure.tabName)
			setReadMysqlTable(self.Failure.tabName, table)
		}
		rows, err := table.SelectAll()
		if err != nil {
			return
		}
		// Release the result set on every exit path.
		defer rows.Close()

		for rows.Next() {
			var id int
			var failure string
			if scanErr := rows.Scan(&id, &failure); scanErr != nil {
				logs.Log.Error(" *     Fail  [读取失败记录][mysql]: %v\n", scanErr)
				continue
			}
			req, err := request.UnSerialize(failure)
			if err != nil {
				continue
			}
			self.Failure.list[req] = true
			fLen++
		}

	default:
		f, err := os.Open(self.Failure.fileName)
		if err != nil {
			return
		}
		b, err := ioutil.ReadAll(f)
		f.Close()
		if err != nil {
			logs.Log.Error(" *     Fail  [读取失败记录][file]: %v\n", err)
			return
		}

		if len(b) == 0 {
			return
		}

		// The file holds a JSON array of serialized requests.
		docs := []string{}
		if err = json.Unmarshal(b, &docs); err != nil {
			logs.Log.Error(" *     Fail  [读取失败记录][file]: %v\n", err)
		}

		fLen = len(docs)

		for _, s := range docs {
			req, err := request.UnSerialize(s)
			if err != nil {
				continue
			}
			self.Failure.list[req] = true
		}
	}

	logs.Log.Informational(" *     [读取失败记录]: %v 条\n", fLen)
}
Example #8
0
// ReadSuccess loads the persisted success records into self.Success.old.
//
// provider selects the storage: "mgo", "mysql", or any other value for the
// local file SUCCESS_FILE_FULL. inherit controls whether history is used:
//   - inherit == false: both record sets are emptied and nothing is read;
//   - inherit == true and the previous call also inherited: the in-memory
//     records are kept and nothing is read;
//   - inherit == true after a non-inheriting call: both record sets are
//     emptied and self.Success.old is refilled from storage.
//
// An unavailable MySQL connection, a failed SELECT, and a missing or empty
// file end the call silently; other failures are logged.
func (self *History) ReadSuccess(provider string, inherit bool) {
	self.RWMutex.Lock()
	self.provider = provider
	self.RWMutex.Unlock()

	if !inherit {
		// History is not inherited.
		self.Success.old = make(map[string]bool)
		self.Success.new = make(map[string]bool)
		self.Success.inheritable = false
		return
	}
	if self.Success.inheritable {
		// Both this run and the previous one inherit history.
		return
	}
	// The previous run did not inherit history, this one does.
	self.Success.old = make(map[string]bool)
	self.Success.new = make(map[string]bool)
	self.Success.inheritable = true

	switch provider {
	case "mgo":
		var docs = map[string]interface{}{}
		err := mgo.Mgo(&docs, "find", map[string]interface{}{
			"Database":   MGO_DB,
			"Collection": SUCCESS_FILE,
		})
		if err != nil {
			logs.Log.Error("从mgo读取成功记录: %v", err)
			return
		}
		// Checked assertions: a missing "Docs" entry or a document without
		// a string _id is skipped instead of causing a panic.
		list, _ := docs["Docs"].([]interface{})
		for _, v := range list {
			doc, ok := v.(bson.M)
			if !ok {
				continue
			}
			if id, ok := doc["_id"].(string); ok {
				self.Success.old[id] = true
			}
		}

	case "mysql":
		db, ok := mysql.MysqlPool.GetOne().(*mysql.MysqlSrc)
		if !ok || db == nil {
			// No connection available; return without logging.
			return
		}
		defer mysql.MysqlPool.Free(db)
		rows, err := mysql.New(db.DB).
			SetTableName("`" + SUCCESS_FILE + "`").
			SelectAll()
		if err != nil {
			return
		}
		// Release the result set before the connection goes back to the pool.
		defer rows.Close()

		for rows.Next() {
			var id string
			if err = rows.Scan(&id); err != nil {
				// Do not record an empty key for a row that failed to scan.
				logs.Log.Error("从mysql读取成功记录: %v", err)
				continue
			}
			self.Success.old[id] = true
		}

	default:
		f, err := os.Open(SUCCESS_FILE_FULL)
		if err != nil {
			return
		}
		defer f.Close()
		b, err := ioutil.ReadAll(f)
		if err != nil {
			logs.Log.Error("读取成功记录文件: %v", err)
			return
		}
		// An empty file has no first byte to patch; indexing it would panic.
		if len(b) == 0 {
			return
		}
		// The file holds `,"k":true,"k2":true…`; swap the leading comma for
		// '{' and append '}' to obtain a JSON object.
		b[0] = '{'
		if err = json.Unmarshal(append(b, '}'), &self.Success.old); err != nil {
			logs.Log.Error("解析成功记录文件: %v", err)
		}
	}
	logs.Log.Informational(" *     读出 %v 条成功记录\n", len(self.Success.old))
}
Example #9
0
// ReadFailure loads the persisted failure records into self.Failure.list,
// which maps a spider name to the set of its serialized failed requests.
//
// provider selects the storage: "mgo", "mysql", or any other value for the
// local file FAILURE_FILE_FULL. inherit controls whether history is used:
//   - inherit == false: the list is emptied and nothing is read;
//   - inherit == true and the previous call also inherited: the in-memory
//     list is kept and nothing is read;
//   - inherit == true after a non-inheriting call: the list is emptied and
//     refilled from storage.
//
// Records that cannot be deserialized are skipped. A failed SELECT and a
// missing or empty file end the call silently; other failures are logged.
func (self *History) ReadFailure(provider string, inherit bool) {
	self.RWMutex.Lock()
	self.provider = provider
	self.RWMutex.Unlock()

	if !inherit {
		// History is not inherited.
		self.Failure.list = make(map[string]map[string]bool)
		self.Failure.inheritable = false
		return
	}
	if self.Failure.inheritable {
		// Both this run and the previous one inherit history.
		return
	}
	// The previous run did not inherit history, this one does.
	self.Failure.list = make(map[string]map[string]bool)
	self.Failure.inheritable = true

	// fLen counts the records that were loaded, for the final log line.
	var fLen int
	switch provider {
	case "mgo":
		var docs = []interface{}{}
		s, c, err := mgo.Open(MGO_DB, FAILURE_FILE)
		if err != nil {
			logs.Log.Error("从mgo读取失败记录: %v", err)
			return
		}
		err = c.Find(nil).All(&docs)
		mgo.Close(s)
		if err != nil {
			logs.Log.Error("从mgo读取失败记录: %v", err)
		}

		for _, v := range docs {
			// Checked assertions: a document without a string _id is
			// skipped instead of causing a panic.
			doc, ok := v.(bson.M)
			if !ok {
				continue
			}
			failure, ok := doc["_id"].(string)
			if !ok {
				continue
			}
			req, err := context.UnSerialize(failure)
			if err != nil {
				continue
			}
			spName := req.GetSpiderName()
			if _, ok := self.Failure.list[spName]; !ok {
				self.Failure.list[spName] = make(map[string]bool)
			}
			self.Failure.list[spName][failure] = true
			fLen++
		}

	case "mysql":
		db, ok := mysql.MysqlPool.GetOne().(*mysql.MysqlSrc)
		if !ok || db == nil {
			logs.Log.Error("链接Mysql数据库超时,无法读取失败记录!")
			return
		}
		// Return the connection on every exit path, and only after the rows
		// below have been consumed.
		defer mysql.MysqlPool.Free(db)

		rows, err := mysql.New(db.DB).
			SetTableName("`" + FAILURE_FILE + "`").
			SelectAll()
		if err != nil {
			return
		}
		defer rows.Close()

		for rows.Next() {
			var id int
			var failure string
			if scanErr := rows.Scan(&id, &failure); scanErr != nil {
				logs.Log.Error("从mysql读取失败记录: %v", scanErr)
				continue
			}
			req, err := context.UnSerialize(failure)
			if err != nil {
				continue
			}
			spName := req.GetSpiderName()
			if _, ok := self.Failure.list[spName]; !ok {
				self.Failure.list[spName] = make(map[string]bool)
			}
			self.Failure.list[spName][failure] = true
			fLen++
		}

	default:
		f, err := os.Open(FAILURE_FILE_FULL)
		if err != nil {
			return
		}
		b, err := ioutil.ReadAll(f)
		f.Close()
		if err != nil {
			logs.Log.Error("读取失败记录文件: %v", err)
			return
		}
		// An empty file has no first byte to patch; indexing it would panic.
		if len(b) == 0 {
			return
		}

		// The file holds a JSON object body with a leading ',' and no
		// closing brace; restore both before decoding.
		b[0] = '{'
		if err = json.Unmarshal(append(b, '}'), &self.Failure.list); err != nil {
			logs.Log.Error("解析失败记录文件: %v", err)
		}
		for _, v := range self.Failure.list {
			fLen += len(v)
		}

	}
	logs.Log.Informational(" *     读出 %v 条失败记录\n", fLen)
}
Example #10
0
// flush writes the pending success records held in self.new to the storage
// selected by provider ("mgo", "mysql", or any other value for the local
// file self.fileName) and moves them into self.old.
//
// sLen is the number of records that were pending when flush was called.
// err describes why the records could not be stored. On a MySQL or file
// error the function returns before self.new is cleared.
func (self *Success) flush(provider string) (sLen int, err error) {
	self.RWMutex.Lock()
	defer self.RWMutex.Unlock()

	sLen = len(self.new)
	if sLen == 0 {
		return
	}

	switch provider {
	case "mgo":
		if mgo.Error() != nil {
			return sLen, fmt.Errorf(" *     Fail  [添加成功记录][mgo]: %v 条 [ERROR]  %v\n", sLen, mgo.Error())
		}
		var docs = make([]map[string]interface{}, sLen)
		var i int
		for key := range self.new {
			docs[i] = map[string]interface{}{"_id": key}
			self.old[key] = true
			i++
		}
		// Use a separate name for the insert error: redeclaring err here
		// would shadow the named result and the failure would never reach
		// the caller.
		insertErr := mgo.Mgo(nil, "insert", map[string]interface{}{
			"Database":   config.DB_NAME,
			"Collection": self.tabName,
			"Docs":       docs,
		})
		if insertErr != nil {
			err = fmt.Errorf(" *     Fail  [添加成功记录][mgo]: %v 条 [ERROR]  %v\n", sLen, insertErr)
		}

	case "mysql":
		if _, dbErr := mysql.DB(); dbErr != nil {
			return sLen, fmt.Errorf(" *     Fail  [添加成功记录][mysql]: %v 条 [ERROR]  %v\n", sLen, dbErr)
		}
		table, ok := getWriteMysqlTable(self.tabName)
		if !ok {
			table = mysql.New()
			table.SetTableName(self.tabName).CustomPrimaryKey(`id VARCHAR(255) not null primary key`)
			if createErr := table.Create(); createErr != nil {
				return sLen, fmt.Errorf(" *     Fail  [添加成功记录][mysql]: %v 条 [ERROR]  %v\n", sLen, createErr)
			}
			setWriteMysqlTable(self.tabName, table)
		}
		for key := range self.new {
			table.AutoInsert([]string{key})
			self.old[key] = true
		}
		if flushErr := table.FlushInsert(); flushErr != nil {
			return sLen, fmt.Errorf(" *     Fail  [添加成功记录][mysql]: %v 条 [ERROR]  %v\n", sLen, flushErr)
		}

	default:
		f, openErr := os.OpenFile(self.fileName, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0777)
		if openErr != nil {
			return sLen, fmt.Errorf(" *     Fail  [添加成功记录][file]: %v 条 [ERROR]  %v\n", sLen, openErr)
		}
		defer f.Close()

		b, marshalErr := json.Marshal(self.new)
		if marshalErr != nil {
			return sLen, fmt.Errorf(" *     Fail  [添加成功记录][file]: %v 条 [ERROR]  %v\n", sLen, marshalErr)
		}
		// Replace the opening '{' with ',' and drop the closing '}' so that
		// every append adds more `,"key":true` members to the same file.
		b[0] = ','
		if _, writeErr := f.Write(b[:len(b)-1]); writeErr != nil {
			return sLen, fmt.Errorf(" *     Fail  [添加成功记录][file]: %v 条 [ERROR]  %v\n", sLen, writeErr)
		}

		// Only mark the records as saved once they are on disk.
		for key := range self.new {
			self.old[key] = true
		}
	}
	self.new = make(map[string]bool)
	return
}
Example #11
0
// flush clears the previously persisted failure records and then writes the
// current content of self.list to the storage selected by provider ("mgo",
// "mysql", or any other value for the local file self.fileName).
//
// fLen is the number of records in self.list. err is non-nil when the
// storage is unavailable, when a MySQL statement fails, or when the local
// file cannot be written.
func (self *Failure) flush(provider string) (fLen int, err error) {
	self.RWMutex.Lock()
	defer self.RWMutex.Unlock()
	fLen = len(self.list)

	switch provider {
	case "mgo":
		if mgo.Error() != nil {
			return fLen, fmt.Errorf(" *     Fail  [添加失败记录][mgo]: %v 条 [ERROR]  %v\n", fLen, mgo.Error())
		}
		// NOTE(review): the results of DropCollection, Insert and mgo.Call
		// are not checked.
		mgo.Call(func(src pool.Src) error {
			c := src.(*mgo.MgoSrc).DB(config.DB_NAME).C(self.tabName)
			// Drop the old failure records.
			c.DropCollection()
			if fLen == 0 {
				return nil
			}

			var docs = []interface{}{}
			for req := range self.list {
				docs = append(docs, map[string]interface{}{"_id": req.Serialize()})
			}
			c.Insert(docs...)
			return nil
		})

	case "mysql":
		db, dbErr := mysql.DB()
		if dbErr != nil {
			return fLen, fmt.Errorf(" *     Fail  [添加失败记录][mysql]: %v 条 [ERROR]  %v\n", fLen, dbErr)
		}
		// Drop the old failure-record table.
		stmt, prepErr := db.Prepare(`DROP TABLE ` + self.tabName)
		if prepErr != nil {
			return fLen, fmt.Errorf(" *     Fail  [添加失败记录][mysql]: %v 条 [ERROR]  %v\n", fLen, prepErr)
		}
		// The Exec result is not checked, as before.
		stmt.Exec()
		// Release the prepared statement; it is not needed again.
		stmt.Close()
		if fLen == 0 {
			return fLen, nil
		}
		table, ok := getWriteMysqlTable(self.tabName)
		if !ok {
			table = mysql.New()
			table.SetTableName(self.tabName).AddColumn(`failure MEDIUMTEXT`)
			setWriteMysqlTable(self.tabName, table)
		}
		// Recreate the table and add the failed requests.
		if createErr := table.Create(); createErr != nil {
			return fLen, fmt.Errorf(" *     Fail  [添加失败记录][mysql]: %v 条 [ERROR]  %v\n", fLen, createErr)
		}
		for req := range self.list {
			table.AutoInsert([]string{req.Serialize()})
		}
		if flushErr := table.FlushInsert(); flushErr != nil {
			return fLen, fmt.Errorf(" *     Fail  [添加失败记录][mysql]: %v 条 [ERROR]  %v\n", fLen, flushErr)
		}

	default:
		// Remove the old failure-record file; the result is ignored on
		// purpose because the file may simply not exist yet.
		os.Remove(self.fileName)
		if fLen == 0 {
			return
		}

		docs := make([]string, 0, len(self.list))
		for req := range self.list {
			docs = append(docs, req.Serialize())
		}
		b, marshalErr := json.Marshal(docs)
		if marshalErr != nil {
			return fLen, fmt.Errorf(" *     Fail  [添加失败记录][file]: %v 条 [ERROR]  %v\n", fLen, marshalErr)
		}
		// json.Marshal escapes '&' as \u0026; store it in plain form.
		b = bytes.Replace(b, []byte(`\u0026`), []byte(`&`), -1)

		f, openErr := os.OpenFile(self.fileName, os.O_CREATE|os.O_WRONLY, 0777)
		if openErr != nil {
			return fLen, fmt.Errorf(" *     Fail  [添加失败记录][file]: %v 条 [ERROR]  %v\n", fLen, openErr)
		}
		defer f.Close()
		if _, writeErr := f.Write(b); writeErr != nil {
			return fLen, fmt.Errorf(" *     Fail  [添加失败记录][file]: %v 条 [ERROR]  %v\n", fLen, writeErr)
		}
	}
	return
}
Example #12
0
// flush clears the previously persisted failure records and then writes the
// current content of self.list (spider name -> set of serialized requests)
// to the storage selected by provider ("mgo", "mysql", or any other value
// for the local file FAILURE_FILE_FULL).
//
// It returns the total number of records in self.list, or 0 when no MySQL
// connection could be obtained or the local file could not be written.
func (self *Failure) flush(provider string) (fLen int) {
	self.RWMutex.Lock()
	defer self.RWMutex.Unlock()
	for _, val := range self.list {
		fLen += len(val)
	}

	switch provider {
	case "mgo":
		s, c, err := mgo.Open(MGO_DB, FAILURE_FILE)
		if err != nil {
			logs.Log.Error("向mgo保存失败记录: %v", err)
			return
		}
		defer mgo.Close(s)

		// Drop the old failure records.
		// NOTE(review): the results of DropCollection and Insert are not checked.
		c.DropCollection()
		if fLen == 0 {
			return
		}

		var docs = []interface{}{}
		for _, val := range self.list {
			for key := range val {
				docs = append(docs, map[string]interface{}{"_id": key})
			}
		}
		c.Insert(docs...)

	case "mysql":
		db, ok := mysql.MysqlPool.GetOne().(*mysql.MysqlSrc)
		if !ok || db == nil {
			logs.Log.Error("链接Mysql数据库超时,无法保存失败记录!")
			return 0
		}
		// Return the connection on every exit path, including the early
		// returns below.
		defer mysql.MysqlPool.Free(db)

		// Drop the old failure-record table.
		stmt, err := db.DB.Prepare(`DROP TABLE ` + FAILURE_FILE)
		if err != nil {
			logs.Log.Error("清空Mysql失败记录表: %v", err)
			return
		}
		// The Exec result is not checked, as before.
		stmt.Exec()
		// Release the prepared statement; it is not needed again.
		stmt.Close()
		if fLen == 0 {
			return
		}

		table := mysql.New(db.DB).
			SetTableName(FAILURE_FILE).
			AddColumn(`failure MEDIUMTEXT`).
			Create()
		for _, val := range self.list {
			for key := range val {
				table.AddRow(key).Update()
			}
		}

	default:
		// Remove the old failure-record file; the result is ignored on
		// purpose because the file may simply not exist yet.
		os.Remove(FAILURE_FILE_FULL)
		if fLen == 0 {
			return
		}

		f, err := os.OpenFile(FAILURE_FILE_FULL, os.O_CREATE|os.O_WRONLY, 0660)
		if err != nil {
			logs.Log.Error("打开失败记录文件: %v", err)
			return 0
		}
		defer f.Close()

		b, err := json.Marshal(self.list)
		if err != nil {
			logs.Log.Error("序列化失败记录: %v", err)
			return 0
		}
		// Stored as a JSON object body with a leading ',' and no closing '}'.
		b[0] = ','
		if _, err = f.Write(b[:len(b)-1]); err != nil {
			logs.Log.Error("写入失败记录文件: %v", err)
			return 0
		}
	}
	return
}
Example #13
0
// Submit writes the pending deduplication samples held in self.sampling.new
// to the storage selected by provider ("mgo", "mysql", or any other value
// for a file under config.COMM_PATH.CACHE), moves them into
// self.sampling.old and logs how many were added.
//
// When no MySQL connection is available or the file cannot be written, the
// failure is logged and the pending samples are kept for a later call.
func (self *Deduplication) Submit(provider string) {
	self.Mutex.Lock()
	defer self.Unlock()

	self.provider = provider

	if len(self.sampling.new) == 0 {
		return
	}

	switch self.provider {
	case "mgo":
		var docs = make([]map[string]interface{}, len(self.sampling.new))
		var i int
		for key := range self.sampling.new {
			docs[i] = map[string]interface{}{"_id": key}
			self.sampling.old[key] = true
			i++
		}
		// NOTE(review): the result of mgo.Mgo is not checked here.
		mgo.Mgo(nil, "insert", map[string]interface{}{
			"Database":   config.MGO.DB,
			"Collection": config.DEDUPLICATION.FILE_NAME,
			"Docs":       docs,
		})

	case "mysql":
		db, ok := mysql.MysqlPool.GetOne().(*mysql.MysqlSrc)
		if !ok || db == nil {
			logs.Log.Error("链接Mysql数据库超时,无法保存去重记录!")
			return
		}
		defer mysql.MysqlPool.Free(db)
		table := mysql.New(db.DB).
			SetTableName(config.DEDUPLICATION.FILE_NAME).
			CustomPrimaryKey(`id VARCHAR(255) not null primary key`).
			Create()
		for key := range self.sampling.new {
			table.AddRow(key).Update()
			self.sampling.old[key] = true
		}

	default:
		fileName := config.COMM_PATH.CACHE + "/" + config.DEDUPLICATION.FILE_NAME
		p, _ := path.Split(fileName)
		// Make sure the directory that holds the file exists.
		d, err := os.Stat(p)
		if err != nil || !d.IsDir() {
			if err := os.MkdirAll(p, 0777); err != nil {
				logs.Log.Error("Error: %v\n", err)
			}
		}

		f, err := os.OpenFile(fileName, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0660)
		if err != nil {
			logs.Log.Error("Error: %v\n", err)
			return
		}
		defer f.Close()

		b, err := json.Marshal(self.sampling.new)
		if err != nil {
			logs.Log.Error("Error: %v\n", err)
			return
		}
		// Replace the opening '{' with ',' and drop the closing '}' so that
		// every append adds more `,"key":true` members to the same file.
		b[0] = ','
		if _, err = f.Write(b[:len(b)-1]); err != nil {
			logs.Log.Error("Error: %v\n", err)
			return
		}

		// Only mark the samples as saved once they are on disk.
		for key := range self.sampling.new {
			self.sampling.old[key] = true
		}
	}
	logs.Log.Informational(" *     新增 %v 条去重样本\n", len(self.sampling.new))
	self.sampling.new = make(map[string]bool)
}
Example #14
0
// Update loads the persisted deduplication samples into self.sampling.old.
//
// provider selects the storage: "mgo", "mysql", or any other value for a
// file under config.COMM_PATH.CACHE. inherit controls whether history is
// used:
//   - inherit == false: both sample sets are emptied and nothing is read;
//   - inherit == true and the previous call also inherited: the in-memory
//     samples are kept and nothing is read;
//   - inherit == true after a non-inheriting call: both sample sets are
//     emptied and self.sampling.old is refilled from storage.
//
// A failed SELECT and a missing or empty file end the call silently; other
// failures are logged.
func (self *Deduplication) Update(provider string, inherit bool) {
	self.Mutex.Lock()
	defer self.Unlock()

	self.provider = provider

	if !inherit {
		// History is not inherited.
		self.sampling.old = make(map[string]bool)
		self.sampling.new = make(map[string]bool)
		self.lastIsInherit = false
		return
	}
	if self.lastIsInherit {
		// Both this run and the previous one inherit history.
		return
	}
	// The previous run did not inherit history, this one does.
	self.sampling.old = make(map[string]bool)
	self.sampling.new = make(map[string]bool)
	self.lastIsInherit = true

	switch self.provider {
	case "mgo":
		var docs = map[string]interface{}{}
		err := mgo.Mgo(&docs, "find", map[string]interface{}{
			"Database":   config.MGO.DB,
			"Collection": config.DEDUPLICATION.FILE_NAME,
		})
		if err != nil {
			logs.Log.Error("去重读取mgo: %v", err)
			return
		}
		// Checked assertions: a missing "Docs" entry or a document without
		// a string _id is skipped instead of causing a panic.
		list, _ := docs["Docs"].([]interface{})
		for _, v := range list {
			doc, ok := v.(bson.M)
			if !ok {
				continue
			}
			if id, ok := doc["_id"].(string); ok {
				self.sampling.old[id] = true
			}
		}

	case "mysql":
		db, ok := mysql.MysqlPool.GetOne().(*mysql.MysqlSrc)
		if !ok || db == nil {
			logs.Log.Error("链接Mysql数据库超时,无法读取去重记录!")
			return
		}
		defer mysql.MysqlPool.Free(db)
		rows, err := mysql.New(db.DB).
			SetTableName("`" + config.DEDUPLICATION.FILE_NAME + "`").
			SelectAll()
		if err != nil {
			return
		}
		// Release the result set before the connection goes back to the pool.
		defer rows.Close()

		for rows.Next() {
			var id string
			if err = rows.Scan(&id); err != nil {
				// Do not record an empty key for a row that failed to scan.
				logs.Log.Error("去重读取mysql: %v", err)
				continue
			}
			self.sampling.old[id] = true
		}

	default:
		f, err := os.Open(config.COMM_PATH.CACHE + "/" + config.DEDUPLICATION.FILE_NAME)
		if err != nil {
			return
		}
		defer f.Close()
		b, err := ioutil.ReadAll(f)
		if err != nil {
			logs.Log.Error("去重读取文件: %v", err)
			return
		}
		// An empty file has no first byte to patch; indexing it would panic.
		if len(b) == 0 {
			return
		}
		// The file holds `,"k":true,"k2":true…`; swap the leading comma for
		// '{' and append '}' to obtain a JSON object.
		b[0] = '{'
		if err = json.Unmarshal(append(b, '}'), &self.sampling.old); err != nil {
			logs.Log.Error("去重解析文件: %v", err)
		}
	}
	logs.Log.Informational(" *     读出 %v 条去重样本\n", len(self.sampling.old))
}