123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205 |
- package job
-
- import (
- "fmt"
- "git.links123.net/Slate/CorpusAI/service"
- "git.links123.net/Slate/CorpusAI/service/store/cache"
- "git.links123.net/Slate/CorpusAI/service/store/mysql"
- "git.links123.net/Slate/CorpusAI/config"
- "github.com/jinzhu/gorm"
- _ "github.com/go-sql-driver/mysql"
- "github.com/spf13/cobra"
- "github.com/gocolly/colly"
- "net/url"
- "strings"
- "html"
- )
-
// TtsRaw is one row of the lnk_corpus_tts_raw table.
//
// Field order is significant: the struct is printed with fmt.Println by the
// job command, so reordering fields would change that output.
type TtsRaw struct {
	ID      int64
	Text    string // the translated text; the job command uses it as the lookup word
	UniqKey string // unique key value
	Status  int64  // processing status written back by the job command
	Remark  string // free-form note stored alongside the status
}

// TableName returns the database table that TtsRaw rows are stored in.
func (TtsRaw) TableName() string {
	return "lnk_corpus_tts_raw"
}
-
// langType pairs a language code with the name of a voice for that language,
// e.g. {"en-US", "en-US-Wavenet-B"}.
//
// Field order matters: values are built with unkeyed composite literals.
type langType struct {
	languageCode string // language tag such as "en-US" or "en-GB"
	voiceName    string // voice identifier such as "en-GB-Wavenet-C"
}
-
-
// Phrase is one scraped entry stored in the lnk_corpus_phrase_spider table.
//
// Field order is significant: values are printed with fmt.Println by the job
// command, so reordering fields would change that output.
type Phrase struct {
	Text       string // the example sentence or phrase itself
	Paraphrase string // the explanation scraped next to Text
	Type       int64  // 0 for example sentences, 1 for collocation/annotation phrases
	Word       string // the word whose dictionary page produced this entry
}

// TableName returns the database table that Phrase rows are stored in.
func (Phrase) TableName() string {
	return "lnk_corpus_phrase_spider"
}
-
- func RunCommand() *cobra.Command {
- cmd := &cobra.Command{
- Use: "job",
- Short: "Run the job service",
- Run: func(cmd *cobra.Command, args []string) {
- //fmt.Println("Echo: " + strings.Join(args, " "))
-
- minId := args[0]
- maxId := args[1]
-
- dbConfig := config.C.DB
- settings := dbConfig.User+":"+dbConfig.Password+"@tcp("+dbConfig.Host+")/"+dbConfig.Name+"?charset=utf8&parseTime=True&loc=Local"
-
- var ttsRawList []TtsRaw
-
- db, err := gorm.Open("mysql", settings)
- if err != nil {
- panic("failed to connect database")
- }
-
- db.Where("id <= ? and id >= ?", maxId, minId).Find(&ttsRawList)
-
- for _, ttsRaw := range ttsRawList {
-
- word := ttsRaw.Text
- c := colly.NewCollector(
- // Visit only domains: hackerspaces.org, wiki.hackerspaces.org
- colly.AllowedDomains("dict.cn", "m.dict.cn"),
- )
-
- // 例句
- c.OnHTML("div[class=\"layout sort\"]", func(e *colly.HTMLElement) {
- e.ForEach("ol", func(_ int, eol *colly.HTMLElement) {
- eol.ForEach("li", func(_ int, el *colly.HTMLElement) {
- //fmt.Println(el.DOM.Html())
- liText, _ := el.DOM.Html()
- liData := strings.Split(html.UnescapeString(liText), "<br/>")
- example := ReplaceTrim(liData[0])
- exampleParaphrase := ReplaceTrim(liData[1])
-
- examplePhrase := Phrase{Text:example, Paraphrase:exampleParaphrase, Type: 0, Word:word}
- fmt.Println(examplePhrase)
- db.Create(&examplePhrase)
- })
- })
- })
-
- // 词汇搭配, 短语
- c.OnHTML("div[class=\"layout coll\"]", func(e *colly.HTMLElement) {
-
- e.ForEach("li", func(_ int, el *colly.HTMLElement) {
- if el.ChildAttr("a", "href") != "" {
- phrase := ReplaceTrim(el.ChildText("a"))
- paraphrase := ReplaceTrim(el.Text)
- paraphrase = strings.Replace(paraphrase, phrase, "", 1)
-
- newPhrase := Phrase{Text:phrase, Paraphrase:paraphrase, Type: 1, Word:word}
- fmt.Println(newPhrase)
- db.Create(&newPhrase)
- }
- })
- })
-
- c.OnHTML("div[class=\"layout anno\"]", func(e *colly.HTMLElement) {
-
- e.ForEach("li", func(_ int, el *colly.HTMLElement) {
- if el.ChildAttr("a", "href") != "" {
- phrase := ReplaceTrim(el.ChildText("a"))
- paraphrase := ReplaceTrim(el.Text)
- paraphrase = strings.Replace(paraphrase, phrase, "", 1)
-
- newPhrase := Phrase{Text:phrase, Paraphrase:paraphrase, Type: 1, Word:word}
- fmt.Println(newPhrase)
- db.Create(&newPhrase)
- }
- })
- })
- c.OnRequest(func(r *colly.Request) {
- fmt.Println("Visiting", r.URL.String())
- })
-
- c.Visit("http://dict.cn/"+word)
-
- //if result == true {
- ttsRaw.Status = 1
- //更新成功
- //} else {
- // ttsRaw.Status = -1
- // //更新错误状态和msg
- // ttsRaw.Remark = msg
- //}
- fmt.Println(ttsRaw)
- db.Save(&ttsRaw)
- }
-
- db.Close()
- },
- }
- return cmd
- }
-
- func SyncTtsOss(text string, speed float64, pitch float64) (bool, string) {
-
- text, _ = url.QueryUnescape(text)
-
- text = strings.Trim(text , "")
-
- if text == "" {
- return false, "Error: text null"
- }
-
- typeMap := make(map[int]langType)
- typeMap[1] = langType{"en-US","en-US-Wavenet-B"}
- typeMap[2] = langType{"en-US","en-US-Wavenet-C"}
- typeMap[3] = langType{"en-GB","en-GB-Wavenet-B"}
- typeMap[4] = langType{"en-GB","en-GB-Wavenet-C"}
-
- for _, lang := range typeMap {
-
- ossObjectKey := service.GetTtsOssKey(text, lang.voiceName, lang.languageCode, speed, pitch)
-
- textKey := cache.GetTextKey(ossObjectKey)
-
- AudioContent, err := service.TextToSpeech(text, lang.voiceName, lang.languageCode, speed, pitch)
- if err != nil {
-
- return false, "TextToSpeech Error:" + err.Error()
- }
-
- uploadResult, err := service.UploadHkOss(ossObjectKey, AudioContent)
-
- if uploadResult == true {
- uploadResult, err = service.UploadOss(ossObjectKey, AudioContent)
-
- if uploadResult == true {
-
- //hk&cn节点oss都同步成功, set db
- mysql.CreateCorpusTts(text, textKey, lang.languageCode, lang.voiceName, ossObjectKey, speed, pitch)
- }
- }
-
- if err != nil {
-
- return false, "UploadHkOss Error" + err.Error()
- }
- }
-
- return true, ""
- }
-
// ReplaceTrim removes every newline and tab from str and then strips leading
// and trailing whitespace from the result.
func ReplaceTrim(str string) string {
	str = strings.Replace(str, "\n", "", -1)
	str = strings.Replace(str, "\t", "", -1)

	// strings.Trim(str, "") has an empty cutset and trims nothing; TrimSpace
	// performs the trimming the function name promises.
	return strings.TrimSpace(str)
}
-
|