chenyuanyang 5 years ago
parent
commit
e8855bd923
1 changed files with 181 additions and 67 deletions
  1. 181
    67
      cmd/job/job.go

+ 181
- 67
cmd/job/job.go View File

@@ -1,6 +1,8 @@
1 1
 package job
2 2
 
3 3
 import (
4
+	"crypto/md5"
5
+	"encoding/json"
4 6
 	"fmt"
5 7
 	"git.links123.net/Slate/CorpusAI/service"
6 8
 	"git.links123.net/Slate/CorpusAI/service/store/cache"
@@ -10,9 +12,15 @@ import (
10 12
 	_ "github.com/go-sql-driver/mysql"
11 13
 	"github.com/spf13/cobra"
12 14
 	"github.com/gocolly/colly"
15
+	"io/ioutil"
16
+	"log"
17
+	"math/rand"
18
+	"net/http"
13 19
 	"net/url"
20
+	"strconv"
14 21
 	"strings"
15 22
 	"html"
23
+	"time"
16 24
 )
17 25
 
18 26
 type TtsRaw struct {
@@ -40,6 +48,14 @@ type Phrase struct {
40 48
 	Word			string
41 49
 }
42 50
 
51
// Phonetic is one row of the word list whose phonetic transcriptions are
// filled in by the job: the job reads Word, looks it up through transApi and
// writes the result back into UkPhonetic, UsPhonetic and Status.
//
// Field order is kept stable so existing positional literals keep compiling.
type Phonetic struct {
	ID         int64  // row identifier (presumably the primary key; confirm against the table schema)
	Word       string // text that is sent to the translation API
	UkPhonetic string // British transcription returned by the API
	Status     int64  // set to 1 by the job once the word has been processed
	UsPhonetic string // American transcription returned by the API
}
58
+
43 59
 func (Phrase) TableName() string {
44 60
 	return "lnk_corpus_phrase_spider"
45 61
 }
@@ -57,7 +73,7 @@ func RunCommand() *cobra.Command {
57 73
 			dbConfig := config.C.DB
58 74
 			settings := dbConfig.User+":"+dbConfig.Password+"@tcp("+dbConfig.Host+")/"+dbConfig.Name+"?charset=utf8&parseTime=True&loc=Local"
59 75
 
60
-			var ttsRawList []TtsRaw
76
+			var phoneticList []Phonetic
61 77
 
62 78
 			db, err := gorm.Open("mysql", settings)
63 79
 			if err != nil {
@@ -66,80 +82,106 @@ func RunCommand() *cobra.Command {
66 82
 
67 83
 			db.Where("id <= ? and id >= ?", maxId, minId).Find(&ttsRawList)
68 84
 
69
-			for _, ttsRaw := range ttsRawList {
70
-
71
-				word := ttsRaw.Text
72
-				c := colly.NewCollector(
73
-					// Visit only domains: hackerspaces.org, wiki.hackerspaces.org
74
-					colly.AllowedDomains("dict.cn", "m.dict.cn"),
75
-				)
76
-
77
-				// 例句
78
-				c.OnHTML("div[class=\"layout sort\"]", func(e *colly.HTMLElement) {
79
-					e.ForEach("ol", func(_ int, eol *colly.HTMLElement) {
80
-						eol.ForEach("li", func(_ int, el *colly.HTMLElement) {
81
-							//fmt.Println(el.DOM.Html())
82
-							liText, _ := el.DOM.Html()
83
-							liData := strings.Split(html.UnescapeString(liText), "<br/>")
84
-							example := ReplaceTrim(liData[0])
85
-							exampleParaphrase := ReplaceTrim(liData[1])
86
-
87
-							examplePhrase := Phrase{Text:example, Paraphrase:exampleParaphrase, Type: 0, Word:word}
88
-							fmt.Println(examplePhrase)
89
-							db.Create(&examplePhrase)
90
-						})
91
-					})
92
-				})
93
-
94
-				// 词汇搭配, 短语
95
-				c.OnHTML("div[class=\"layout coll\"]", func(e *colly.HTMLElement) {
96
-
97
-					e.ForEach("li", func(_ int, el *colly.HTMLElement) {
98
-						if el.ChildAttr("a", "href") != "" {
99
-							phrase := ReplaceTrim(el.ChildText("a"))
100
-							paraphrase := ReplaceTrim(el.Text)
101
-							paraphrase = strings.Replace(paraphrase, phrase, "", 1)
102
-
103
-							newPhrase := Phrase{Text:phrase, Paraphrase:paraphrase, Type: 1, Word:word}
104
-							fmt.Println(newPhrase)
105
-							db.Create(&newPhrase)
106
-						}
107
-					})
108
-				})
109
-
110
-				c.OnHTML("div[class=\"layout anno\"]", func(e *colly.HTMLElement) {
111
-
112
-					e.ForEach("li", func(_ int, el *colly.HTMLElement) {
113
-						if el.ChildAttr("a", "href") != "" {
114
-							phrase := ReplaceTrim(el.ChildText("a"))
115
-							paraphrase := ReplaceTrim(el.Text)
116
-							paraphrase = strings.Replace(paraphrase, phrase, "", 1)
117
-
118
-							newPhrase := Phrase{Text:phrase, Paraphrase:paraphrase, Type: 1, Word:word}
119
-							fmt.Println(newPhrase)
120
-							db.Create(&newPhrase)
121
-						}
122
-					})
123
-				})
124
-				c.OnRequest(func(r *colly.Request) {
125
-					fmt.Println("Visiting", r.URL.String())
126
-				})
127
-
128
-				c.Visit("http://dict.cn/"+word)
85
+			for _, ttsRaw := range phoneticList {
86
+
87
+				word := ttsRaw.Word
129 88
 
89
+				Phonetic := transApi(word, "", "")
130 90
 				//if result == true {
131 91
 				ttsRaw.Status = 1
132
-					//更新成功
133
-				//} else {
134
-				//	ttsRaw.Status = -1
135
-				//	//更新错误状态和msg
136
-				//	ttsRaw.Remark = msg
137
-				//}
92
+				ttsRaw.UkPhonetic = Phonetic["uk_phonetic"]
93
+				ttsRaw.UsPhonetic = Phonetic["us_phonetic"]
94
+
138 95
 				fmt.Println(ttsRaw)
139 96
 				db.Save(&ttsRaw)
140 97
 			}
141 98
 
142 99
 			db.Close()
100
+
101
+			//return
102
+			//var ttsRawList []TtsRaw
103
+			//
104
+			//db, err := gorm.Open("mysql", settings)
105
+			//if err != nil {
106
+			//	panic("failed to connect database")
107
+			//}
108
+			//
109
+			//db.Where("id <= ? and id >= ?", maxId, minId).Find(&ttsRawList)
110
+			//
111
+			//for _, ttsRaw := range ttsRawList {
112
+			//
113
+			//	word := ttsRaw.Text
114
+			//	c := colly.NewCollector(
115
+			//		// Visit only domains: hackerspaces.org, wiki.hackerspaces.org
116
+			//		colly.AllowedDomains("dict.cn", "m.dict.cn"),
117
+			//	)
118
+			//
119
+			//	// 例句
120
+			//	c.OnHTML("div[class=\"layout sort\"]", func(e *colly.HTMLElement) {
121
+			//		e.ForEach("ol", func(_ int, eol *colly.HTMLElement) {
122
+			//			eol.ForEach("li", func(_ int, el *colly.HTMLElement) {
123
+			//				//fmt.Println(el.DOM.Html())
124
+			//				liText, _ := el.DOM.Html()
125
+			//				liData := strings.Split(html.UnescapeString(liText), "<br/>")
126
+			//				example := ReplaceTrim(liData[0])
127
+			//				exampleParaphrase := ReplaceTrim(liData[1])
128
+			//
129
+			//				examplePhrase := Phrase{Text:example, Paraphrase:exampleParaphrase, Type: 0, Word:word}
130
+			//				fmt.Println(examplePhrase)
131
+			//				db.Create(&examplePhrase)
132
+			//			})
133
+			//		})
134
+			//	})
135
+			//
136
+			//	// 词汇搭配, 短语
137
+			//	c.OnHTML("div[class=\"layout coll\"]", func(e *colly.HTMLElement) {
138
+			//
139
+			//		e.ForEach("li", func(_ int, el *colly.HTMLElement) {
140
+			//			if el.ChildAttr("a", "href") != "" {
141
+			//				phrase := ReplaceTrim(el.ChildText("a"))
142
+			//				paraphrase := ReplaceTrim(el.Text)
143
+			//				paraphrase = strings.Replace(paraphrase, phrase, "", 1)
144
+			//
145
+			//				newPhrase := Phrase{Text:phrase, Paraphrase:paraphrase, Type: 1, Word:word}
146
+			//				fmt.Println(newPhrase)
147
+			//				db.Create(&newPhrase)
148
+			//			}
149
+			//		})
150
+			//	})
151
+			//
152
+			//	c.OnHTML("div[class=\"layout anno\"]", func(e *colly.HTMLElement) {
153
+			//
154
+			//		e.ForEach("li", func(_ int, el *colly.HTMLElement) {
155
+			//			if el.ChildAttr("a", "href") != "" {
156
+			//				phrase := ReplaceTrim(el.ChildText("a"))
157
+			//				paraphrase := ReplaceTrim(el.Text)
158
+			//				paraphrase = strings.Replace(paraphrase, phrase, "", 1)
159
+			//
160
+			//				newPhrase := Phrase{Text:phrase, Paraphrase:paraphrase, Type: 1, Word:word}
161
+			//				fmt.Println(newPhrase)
162
+			//				db.Create(&newPhrase)
163
+			//			}
164
+			//		})
165
+			//	})
166
+			//	c.OnRequest(func(r *colly.Request) {
167
+			//		fmt.Println("Visiting", r.URL.String())
168
+			//	})
169
+			//
170
+			//	c.Visit("http://dict.cn/"+word)
171
+			//
172
+			//	//if result == true {
173
+			//	ttsRaw.Status = 1
174
+			//		//更新成功
175
+			//	//} else {
176
+			//	//	ttsRaw.Status = -1
177
+			//	//	//更新错误状态和msg
178
+			//	//	ttsRaw.Remark = msg
179
+			//	//}
180
+			//	fmt.Println(ttsRaw)
181
+			//	db.Save(&ttsRaw)
182
+			//}
183
+
184
+			db.Close()
143 185
 		},
144 186
 	}
145 187
 	return cmd
@@ -202,3 +244,75 @@ func ReplaceTrim(str string) string {
202 244
 	return strings.Trim(str, "")
203 245
 }
204 246
 
247
+
248
+func transApi(text, from, to string) map[string]string {
249
+
250
+	apiUrl := "http://openapi.youdao.com/api"
251
+	appKey := "629a1435e6d2a894"
252
+	secKey := "DAgjDJfE0xPdZtMVhl1YUFUIrZc2DVHd"
253
+
254
+	basicMap := make (map[string]string)
255
+
256
+	if from == "" {
257
+		from = "auto"
258
+	}
259
+
260
+	if to == "" {
261
+		to = "auto"
262
+	}
263
+
264
+	rand.Seed(time.Now().Unix())
265
+	salt := strconv.Itoa(rand.Int())
266
+
267
+	sign := buildSign(appKey, text, salt, secKey)
268
+
269
+	request, err := http.NewRequest("GET", apiUrl, nil)
270
+	if err != nil {
271
+		log.Print(err)
272
+
273
+		return basicMap
274
+	}
275
+
276
+	query := request.URL.Query()
277
+	query.Add("q", text)
278
+	query.Add("from", from)
279
+	query.Add("to", to)
280
+	query.Add("appKey", appKey)
281
+	query.Add("salt", salt)
282
+	query.Add("sign", sign)
283
+
284
+	request.URL.RawQuery = query.Encode()
285
+
286
+	//fmt.Println(request.URL.String())
287
+
288
+	var resp *http.Response
289
+	resp, err = http.DefaultClient.Do(request)
290
+	if err != nil {
291
+		log.Print(err)
292
+
293
+		return basicMap
294
+	}
295
+
296
+	defer resp.Body.Close()
297
+
298
+	body, _ := ioutil.ReadAll(resp.Body)
299
+
300
+	result := map[string]interface{}{}
301
+	json.Unmarshal(body, &result)
302
+
303
+	basic := result["basic"].(map[string]string)
304
+	//fmt.Println(basic["phonetic"],basic["uk-phonetic"],basic["us-phonetic"])
305
+
306
+	basicMap["uk-phonetic"] = basic["uk-phonetic"]
307
+	basicMap["us-phonetic"] = basic["us-phonetic"]
308
+
309
+	return basicMap
310
+}
311
+
312
// buildSign returns the lowercase hexadecimal MD5 digest of the concatenation
// appKey + text + salt + secKey, which transApi sends as the request's "sign"
// parameter.
func buildSign(appKey, text, salt, secKey string) string {
	digest := md5.New()
	for _, part := range []string{appKey, text, salt, secKey} {
		// hash.Hash.Write is documented never to return an error.
		digest.Write([]byte(part))
	}
	return fmt.Sprintf("%x", digest.Sum(nil))
}