首页 > 技术文章 > Go读取论文并转换为simhash

mengluo 2018-11-06 15:14 原文

package main

import (
	"database/sql"
	_ "flag"
	"fmt"
	_ "io/ioutil"
	"log"
	"os"
	_ "path"
	_ "strings"
	"time"

	_ "baliance.com/gooxml/document"
	_ "github.com/go-sql-driver/mysql"
	"github.com/yanyiwu/gosimhash"
)


func main(){



	t1 := time.Now()

	Mylog(doc)
	if err != nil {
		Mylog(err)
	}

    db, err := sql.Open("mysql", "root:123456@tcp(127.0.0.1:3306)/gzpg_crs_jsj?charset=utf8");
    if err != nil {
        fmt.Println(err);
    }
	sql :="select s1.paper_id,s2.title_cn,s2.abstract_cn,s2.keyword_cn,s2.title_en,s2.abstract_en,s2.keyword_en,s1.s_content from sf_content s1,sf_paper s2 where  s1.paper_id=s2.paper_id limit 10"
	rows, err := db.Query(sql)
    if err != nil {
		fmt.Println(err);
	}
	stmt, err := db.Prepare("INSERT  sim_path SET paperid=?,simcode=?")
	if err != nil {
		fmt.Println(err);
	}

	var str string
	var code string
	//查询多个
    for rows.Next() {
		var paper_id int //论文id
		var title_cn string //中文题目
		var abstract_cn string //中文摘要
		var keyword_cn string //中文关键词
		var title_en string //英文题目
		var abstract_en string //英文摘要
		var keyword_en string //英文关键词
		var s_content string//全文内容
		
        err = rows.Scan(&paper_id, &title_cn,&abstract_cn,&keyword_cn,&title_en,&abstract_en,&keyword_en,&s_content)
		str = fmt.Sprintf("%s\n 摘要:%s\n 关键词:%s\n %s\n Abstract:%s\n Keywords:%s\n %s\n",title_cn,abstract_cn,keyword_cn,title_en,abstract_en,keyword_en,s_content)
		code=simhash(str)
		res, err := stmt.Exec(paper_id, code)
		if err != nil {
			fmt.Println(err);
		}
		id, err := res.LastInsertId()
		if err != nil {
			fmt.Println(err);
		}
		fmt.Print("%s成功%s \n",id,paper_id);
	
	}
	db.Close()
	elapsed := time.Since(t1)
	log.Println("时间花费位:\n" , elapsed)

}

// simhash returns the fingerprint that gosimhash computes for str, rendered
// as a 64-character string of '0' and '1' with the most significant bit
// first.
func simhash(str string) string {
	// A hasher is built from the dictionary files and freed again on every call.
	h := gosimhash.New("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8")
	defer h.Free()

	fp := h.MakeSimhash(str, 1)

	// Position p of the output holds bit 63-p of the fingerprint.
	digits := make([]byte, 64)
	for p := range digits {
		digits[p] = '0' + byte((fp>>uint(63-p))&1)
	}
	return string(digits)
}


// Mylog appends v as one line to the file "20181105go.log", creating the
// file if needed. Each line is prefixed with TAG (declared outside this
// block) and a date/time stamp with microseconds.
//
// If the log file cannot be opened, the open error and v are written through
// the standard logger instead, so the message is not lost.
func Mylog(v ...interface{}) {
	f, err := os.OpenFile("20181105go.log", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
	if err != nil {
		// Do not call Mylog here: it would fail the same way and recurse forever.
		log.Println("Mylog: opening log file:", err)
		log.Println(v...)
		return
	}
	defer f.Close()

	logger := log.New(f, TAG, log.Ldate|log.Ltime|log.Lmicroseconds)
	logger.Println(v...)
}



















推荐阅读