// Package job provides the "job" command, which scrapes example sentences and
// phrases from dict.cn for stored words, plus helpers for synchronising
// text-to-speech audio to OSS.
package job

import (
	"fmt"
	"html"
	"net/url"
	"strings"

	"git.links123.net/Slate/CorpusAI/config"
	"git.links123.net/Slate/CorpusAI/service"
	"git.links123.net/Slate/CorpusAI/service/store/cache"
	"git.links123.net/Slate/CorpusAI/service/store/mysql"
	_ "github.com/go-sql-driver/mysql"
	"github.com/gocolly/colly"
	"github.com/jinzhu/gorm"
	"github.com/spf13/cobra"
)

// TtsRaw is a row of the lnk_corpus_tts_raw table. Its Text is used as the
// word looked up on dict.cn by the job command.
type TtsRaw struct {
	ID      int64
	Text    string // text to be translated
	UniqKey string // unique key
	Status  int64  // set to 1 after a successful scrape, -1 after a failed one
	Remark  string // error message recorded when the scrape fails
}

// TableName returns the database table backing TtsRaw.
func (TtsRaw) TableName() string {
	return "lnk_corpus_tts_raw"
}

// langType pairs a language code with the voice name passed to the
// text-to-speech service.
type langType struct {
	languageCode string
	voiceName    string
}

// Phrase is a row of the lnk_corpus_phrase_spider table. Type is 0 for
// example sentences and 1 for linked phrases/collocations.
type Phrase struct {
	Text       string
	Paraphrase string
	Type       int64
	Word       string
}

// TableName returns the database table backing Phrase.
func (Phrase) TableName() string {
	return "lnk_corpus_phrase_spider"
}

// RunCommand builds the "job" cobra command. The command expects two
// positional arguments, the minimum and maximum TtsRaw id (inclusive); for
// every row in that range it scrapes dict.cn for the row's text, stores the
// phrases found and records the outcome in the row's Status and Remark.
func RunCommand() *cobra.Command {
	cmd := &cobra.Command{
		Use:   "job",
		Short: "Run the job service",
		Run: func(cmd *cobra.Command, args []string) {
			// Indexing args without this guard panics on missing arguments.
			if len(args) < 2 {
				fmt.Println("Error: job requires two arguments: <minId> <maxId>")
				return
			}
			minID, maxID := args[0], args[1]

			dbConfig := config.C.DB
			settings := dbConfig.User + ":" + dbConfig.Password + "@tcp(" + dbConfig.Host + ")/" + dbConfig.Name + "?charset=utf8&parseTime=True&loc=Local"

			db, err := gorm.Open("mysql", settings)
			if err != nil {
				panic("failed to connect database: " + err.Error())
			}
			// Deferred so the connection is released on every exit path.
			defer db.Close()

			var ttsRawList []TtsRaw
			if err := db.Where("id <= ? and id >= ?", maxID, minID).Find(&ttsRawList).Error; err != nil {
				fmt.Println("query tts raw error:", err)
				return
			}

			for _, ttsRaw := range ttsRawList {
				if err := scrapeWord(db, ttsRaw.Text); err != nil {
					// Record the error status and message.
					ttsRaw.Status = -1
					ttsRaw.Remark = err.Error()
				} else {
					// Scrape succeeded.
					ttsRaw.Status = 1
				}
				fmt.Println(ttsRaw)
				if err := db.Save(&ttsRaw).Error; err != nil {
					fmt.Println("save tts raw error:", err)
				}
			}
		},
	}
	return cmd
}

// scrapeWord visits the dict.cn page for word and stores every example
// sentence and linked phrase found there. It returns the error reported by
// the collector's Visit call, if any.
func scrapeWord(db *gorm.DB, word string) error {
	c := colly.NewCollector(
		// Visit only domains: dict.cn, m.dict.cn
		colly.AllowedDomains("dict.cn", "m.dict.cn"),
	)

	// Example sentences.
	c.OnHTML("div[class=\"layout sort\"]", exampleHandler(db, word))
	// Collocations and phrases.
	c.OnHTML("div[class=\"layout coll\"]", linkedPhraseHandler(db, word))
	c.OnHTML("div[class=\"layout anno\"]", linkedPhraseHandler(db, word))

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	// Escape the word so characters such as '?', '#' or '/' cannot alter the
	// structure of the request URL.
	return c.Visit("http://dict.cn/" + url.PathEscape(word))
}

// exampleHandler returns the callback that stores one Phrase of Type 0 for
// each <li> nested in an <ol> of the matched element.
func exampleHandler(db *gorm.DB, word string) func(*colly.HTMLElement) {
	return func(e *colly.HTMLElement) {
		e.ForEach("ol", func(_ int, eol *colly.HTMLElement) {
			eol.ForEach("li", func(_ int, el *colly.HTMLElement) {
				liText, err := el.DOM.Html()
				if err != nil {
					fmt.Println("read example html error:", err)
					return
				}
				// NOTE(review): the separator reached this file as a raw line
				// break inside the string literal; it may originally have been
				// an HTML line-break tag lost in extraction. Confirm against
				// the upstream source.
				liData := strings.Split(html.UnescapeString(liText), "\n")
				if len(liData) < 2 {
					// Without a separator there is no paraphrase part;
					// indexing liData[1] would panic.
					return
				}
				savePhrase(db, Phrase{
					Text:       ReplaceTrim(liData[0]),
					Paraphrase: ReplaceTrim(liData[1]),
					Type:       0,
					Word:       word,
				})
			})
		})
	}
}

// linkedPhraseHandler returns the callback that stores one Phrase of Type 1
// for each <li> of the matched element that contains a link with an href.
// The link text is the phrase; the remaining item text is the paraphrase.
func linkedPhraseHandler(db *gorm.DB, word string) func(*colly.HTMLElement) {
	return func(e *colly.HTMLElement) {
		e.ForEach("li", func(_ int, el *colly.HTMLElement) {
			if el.ChildAttr("a", "href") == "" {
				return
			}
			phrase := ReplaceTrim(el.ChildText("a"))
			paraphrase := ReplaceTrim(el.Text)
			// Drop the first occurrence of the phrase from the item text.
			paraphrase = strings.Replace(paraphrase, phrase, "", 1)
			savePhrase(db, Phrase{
				Text:       phrase,
				Paraphrase: paraphrase,
				Type:       1,
				Word:       word,
			})
		})
	}
}

// savePhrase prints p and inserts it into the database, reporting (but not
// aborting on) an insert failure.
func savePhrase(db *gorm.DB, p Phrase) {
	fmt.Println(p)
	if err := db.Create(&p).Error; err != nil {
		fmt.Println("save phrase error:", err)
	}
}

// SyncTtsOss synthesises text with each configured voice, uploads the audio
// to both OSS targets and, once both uploads succeed, records the entry via
// mysql.CreateCorpusTts.
//
// text is query-unescaped and stripped of surrounding whitespace first. The
// function returns (true, "") when every voice was synchronised; otherwise it
// stops at the first failure and returns false with a message describing it.
func SyncTtsOss(text string, speed float64, pitch float64) (bool, string) {
	decoded, err := url.QueryUnescape(text)
	if err != nil {
		return false, "QueryUnescape Error:" + err.Error()
	}
	text = strings.TrimSpace(decoded)
	if text == "" {
		return false, "Error: text null"
	}

	// A slice keeps the processing order fixed; iterating a map would make
	// the set of voices synced before an early return vary between runs.
	voices := []langType{
		{languageCode: "en-US", voiceName: "en-US-Wavenet-B"},
		{languageCode: "en-US", voiceName: "en-US-Wavenet-C"},
		{languageCode: "en-GB", voiceName: "en-GB-Wavenet-B"},
		{languageCode: "en-GB", voiceName: "en-GB-Wavenet-C"},
	}

	for _, lang := range voices {
		ossObjectKey := service.GetTtsOssKey(text, lang.voiceName, lang.languageCode, speed, pitch)
		textKey := cache.GetTextKey(ossObjectKey)

		audioContent, err := service.TextToSpeech(text, lang.voiceName, lang.languageCode, speed, pitch)
		if err != nil {
			return false, "TextToSpeech Error:" + err.Error()
		}

		ok, err := service.UploadHkOss(ossObjectKey, audioContent)
		if err != nil {
			return false, "UploadHkOss Error" + err.Error()
		}
		if !ok {
			return false, "UploadHkOss Error: upload was not confirmed"
		}

		ok, err = service.UploadOss(ossObjectKey, audioContent)
		if err != nil {
			return false, "UploadOss Error" + err.Error()
		}
		if !ok {
			return false, "UploadOss Error: upload was not confirmed"
		}

		// Both the hk and cn OSS uploads succeeded; record the entry in the db.
		mysql.CreateCorpusTts(text, textKey, lang.languageCode, lang.voiceName, ossObjectKey, speed, pitch)
	}
	return true, ""
}

// ReplaceTrim removes every newline and tab from str and strips the
// remaining leading and trailing whitespace.
func ReplaceTrim(str string) string {
	str = strings.Replace(str, "\n", "", -1)
	str = strings.Replace(str, "\t", "", -1)
	// strings.Trim with an empty cutset removes nothing; TrimSpace performs
	// the trimming the function name promises.
	return strings.TrimSpace(str)
}