Ejemplo n.º 1
0
func (s *server) internalCrawl(in *pb.CrawlRequest) (*pb.CrawlResponse, error) {
	response := pb.CrawlResponse{}

	// 仅当 method 为 GET 或者 HEAD 时才读取文件缓存
	var cacheFilename, metadataFilename string
	if in.Method == pb.Method_GET || in.Method == pb.Method_HEAD {
		// 生成缓存文件名
		hasher := sha1.New()
		hasher.Write([]byte(in.Url))
		sha := base64.URLEncoding.EncodeToString(hasher.Sum(nil))
		cacheFilename = fmt.Sprintf("%s/page-%s", *pageCacheDir, sha)
		metadataFilename = fmt.Sprintf("%s/metadata-%s", *pageCacheDir, sha)

		// 检查页面的 metadata 和 page 是否已经被抓取
		// 如果有任何异常或者页面过旧,则重新抓取
		if _, err := os.Stat(metadataFilename); err == nil {
			serializedMetadata, err := ioutil.ReadFile(metadataFilename)
			if err == nil {
				response.Metadata = &pb.Metadata{}
				err = proto.Unmarshal(serializedMetadata, response.Metadata)
				if err == nil {
					if in.OnlyReturnMetadata {
						return &response, nil
					}
					if time.Now().UnixNano()-response.Metadata.LastCrawlTimestamp <
						in.RecrawlTtl*int64(time.Millisecond) {
						if _, err := os.Stat(cacheFilename); err == nil {
							content, err := ioutil.ReadFile(cacheFilename)
							if err == nil {
								contentReader := bytes.NewReader(content)
								reader, err := gzip.NewReader(contentReader)
								if err == nil {
									if b, err := ioutil.ReadAll(reader); err == nil {
										response.Content = string(b)
										return &response, nil
									}
									reader.Close()
								}
							}
						}
					}
				}
			}
		}
	}

	// 获取 http 连接
	tr := &http.Transport{
		TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
	}
	client := &http.Client{
		Transport: tr,
	}
	if in.Timeout > 0 {
		client.Timeout = time.Millisecond * time.Duration(in.Timeout)
	}

	// 根据不同的 method 类型,分别调用不同 HTTP 方法
	var resp *http.Response
	var err error
	if in.Method == pb.Method_GET {
		resp, err = client.Get(in.Url)
	} else if in.Method == pb.Method_HEAD {
		resp, err = client.Head(in.Url)
	} else if in.Method == pb.Method_POST {
		buff := bytes.NewBufferString(in.PostBody)
		resp, err = client.Post(in.Url, in.BodyType, buff)
	} else if in.Method == pb.Method_POSTFORM {
		formKeys := url.Values{}
		for _, kv := range in.FormValues {
			formKeys.Add(kv.Key, kv.Value)
		}
		resp, err = client.PostForm(in.Url, formKeys)
	}
	if err != nil {
		return nil, err
	}

	// 只有当 method 不为 HEAD 时才读取页面内容
	var body []byte
	if in.Method != pb.Method_HEAD {
		// 读取页面内容
		var err error
		body, err = ioutil.ReadAll(resp.Body)
		resp.Body.Close()
		if err != nil {
			return nil, err
		}

		// 充填 response
		if !in.OnlyReturnMetadata {
			response.Content = string(body)
		}
		response.IsFreshCrawl = true
	}

	// 充填 metadata
	response.Metadata = &pb.Metadata{}
	response.Metadata.LastCrawlTimestamp = time.Now().UnixNano()
	response.Metadata.Length = uint32(len(body))
	for key, vs := range resp.Header {
		for _, v := range vs {
			response.Metadata.Header = append(response.Metadata.Header, &pb.KV{
				Key:   key,
				Value: v,
			})
		}
	}
	response.Metadata.Status = resp.Status
	response.Metadata.StatusCode = int32(resp.StatusCode)

	// 仅当 method 为 GET 才将页面内容写入文件
	if in.Method == pb.Method_GET {
		var buf bytes.Buffer
		writer := gzip.NewWriter(&buf)
		writer.Write(body)
		writer.Close()
		if err := ioutil.WriteFile(cacheFilename, buf.Bytes(), 0644); err != nil {
			return &response, err
		}
	}

	// 仅当 method 为 GET 或者 HEAD 时才将 metadata 写入文件
	if in.Method == pb.Method_GET || in.Method == pb.Method_HEAD {
		serializedMetadata, err := proto.Marshal(response.Metadata)
		if err != nil {
			return &response, err
		}
		if err := ioutil.WriteFile(metadataFilename, serializedMetadata, 0644); err != nil {
			return &response, err
		}
	}

	return &response, nil
}
Ejemplo n.º 2
0
func (s *server) internalCrawl(in *pb.CrawlRequest) (*pb.CrawlResponse, error) {
	response := pb.CrawlResponse{}

	// 文件名
	hasher := sha1.New()
	hasher.Write([]byte(in.Url))
	sha := base64.URLEncoding.EncodeToString(hasher.Sum(nil))
	cacheFilename := fmt.Sprintf("%s/page-%s", *pageCacheDir, sha)
	metadataFilename := fmt.Sprintf("%s/metadata-%s", *pageCacheDir, sha)

	// 检查页面的 metadata 和 page 是否已经被抓取
	// 如果有任何异常或者页面过旧,则重新抓取
	if _, err := os.Stat(metadataFilename); err == nil {
		serializedMetadata, err := ioutil.ReadFile(metadataFilename)
		if err == nil {
			response.Metadata = &pb.Metadata{}
			err = proto.Unmarshal(serializedMetadata, response.Metadata)
			if err == nil {
				if in.OnlyReturnMetadata {
					return &response, nil
				}
				if time.Now().UnixNano()-response.Metadata.LastCrawlTimestamp < in.CrawlFrequency*int64(time.Millisecond) {
					if _, err := os.Stat(cacheFilename); err == nil {
						content, err := ioutil.ReadFile(cacheFilename)
						if err == nil {
							contentReader := bytes.NewReader(content)
							reader, err := gzip.NewReader(contentReader)
							if err == nil {
								if b, err := ioutil.ReadAll(reader); err == nil {
									response.Content = string(b)
									return &response, nil
								}
								reader.Close()
							}
						}
					}
				}
			}
		}
	}

	// 获取 http 连接
	tr := &http.Transport{
		TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
	}
	client := &http.Client{
		Transport: tr,
	}
	if in.Timeout > 0 {
		client.Timeout = time.Millisecond * time.Duration(in.Timeout)
	}
	resp, err := client.Get(in.Url)
	if err != nil {
		return nil, err
	}

	// 读取页面内容
	body, err := ioutil.ReadAll(resp.Body)
	resp.Body.Close()
	if err != nil {
		return nil, err
	}

	// 充填 response
	if !in.OnlyReturnMetadata {
		response.Content = string(body)
	}
	response.IsFreshCrawl = true

	// 充填 metadata
	response.Metadata = &pb.Metadata{}
	response.Metadata.LastCrawlTimestamp = time.Now().UnixNano()
	response.Metadata.Length = uint32(len(body))

	// 将 page content 写入文件
	var buf bytes.Buffer
	writer := gzip.NewWriter(&buf)
	writer.Write(body)
	writer.Close()
	if err := ioutil.WriteFile(cacheFilename, buf.Bytes(), 0644); err != nil {
		return &response, err
	}

	// 将 metadata 写入文件
	serializedMetadata, err := proto.Marshal(response.Metadata)
	if err != nil {
		return &response, err
	}
	if err := ioutil.WriteFile(metadataFilename, serializedMetadata, 0644); err != nil {
		return &response, err
	}

	return &response, nil
}
Ejemplo n.º 3
0
func (s *server) internalCrawl(in *pb.CrawlRequest) (*pb.CrawlResponse, error) {
	response := pb.CrawlResponse{}

	// 获取 http 连接
	tr := &http.Transport{
		TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
	}
	client := &http.Client{
		Transport: tr,
	}
	if in.Timeout > 0 {
		client.Timeout = time.Millisecond * time.Duration(in.Timeout)
	}

	// 根据不同的 method 类型,分别调用不同 HTTP 方法
	var err error
	var req *http.Request
	if in.Method == pb.Method_GET {
		req, err = http.NewRequest("GET", in.Url, nil)
	} else if in.Method == pb.Method_HEAD {
		req, err = http.NewRequest("HEAD", in.Url, nil)
	} else if in.Method == pb.Method_POST {
		buff := bytes.NewBufferString(in.PostBody)
		req, err = http.NewRequest("POST", in.Url, buff)
		req.Header.Add("Content-Type", in.BodyType)
		req.Header.Add("Content-Length", strconv.Itoa(len(in.PostBody)))
	}

	// 充填 header
	for _, header := range in.Header {
		req.Header.Add(header.Key, header.Value)
	}

	// 发送请求
	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}

	// 只有当 method 不为 HEAD 时才读取页面内容
	var body []byte
	if in.Method != pb.Method_HEAD {
		// 读取页面内容
		var err error
		body, err = ioutil.ReadAll(resp.Body)
		resp.Body.Close()
		if err != nil {
			return nil, err
		}

		// 充填 response
		if !in.OnlyReturnMetadata {
			response.Content = string(body)
		}
	}

	// 充填 metadata
	response.Metadata = &pb.Metadata{}
	response.Metadata.Length = uint32(len(body))
	for key, vs := range resp.Header {
		for _, v := range vs {
			response.Metadata.Header = append(response.Metadata.Header, &pb.KV{
				Key:   key,
				Value: v,
			})
		}
	}
	response.Metadata.Status = resp.Status
	response.Metadata.StatusCode = int32(resp.StatusCode)

	return &response, nil
}