|
@@ -1,6 +1,8 @@
|
1
|
1
|
package job
|
2
|
2
|
|
3
|
3
|
import (
|
|
4
|
+ "crypto/md5"
|
|
5
|
+ "encoding/json"
|
4
|
6
|
"fmt"
|
5
|
7
|
"git.links123.net/Slate/CorpusAI/service"
|
6
|
8
|
"git.links123.net/Slate/CorpusAI/service/store/cache"
|
|
@@ -10,9 +12,15 @@ import (
|
10
|
12
|
_ "github.com/go-sql-driver/mysql"
|
11
|
13
|
"github.com/spf13/cobra"
|
12
|
14
|
"github.com/gocolly/colly"
|
|
15
|
+ "io/ioutil"
|
|
16
|
+ "log"
|
|
17
|
+ "math/rand"
|
|
18
|
+ "net/http"
|
13
|
19
|
"net/url"
|
|
20
|
+ "strconv"
|
14
|
21
|
"strings"
|
15
|
22
|
"html"
|
|
23
|
+ "time"
|
16
|
24
|
)
|
17
|
25
|
|
18
|
26
|
type TtsRaw struct {
|
|
@@ -40,6 +48,14 @@ type Phrase struct {
|
40
|
48
|
Word string
|
41
|
49
|
}
|
42
|
50
|
|
|
51
|
// Phonetic is one database row pairing a word with its UK and US phonetic
// transcriptions. RunCommand loads these rows, fills in the transcriptions
// from transApi, sets Status and saves the row back.
type Phonetic struct {
	// ID is the row identifier.
	ID int64
	// Word is the text sent to the translation API as the query.
	Word string
	// UkPhonetic is the UK transcription.
	// NOTE(review): the original comment called this field the "unique key",
	// which looks copy-pasted from another struct — confirm.
	UkPhonetic string
	// Status is set to 1 by RunCommand once the row has been processed.
	Status int64
	// UsPhonetic is the US transcription.
	UsPhonetic string
}
|
|
58
|
+
|
43
|
59
|
func (Phrase) TableName() string {
|
44
|
60
|
return "lnk_corpus_phrase_spider"
|
45
|
61
|
}
|
|
@@ -57,7 +73,7 @@ func RunCommand() *cobra.Command {
|
57
|
73
|
dbConfig := config.C.DB
|
58
|
74
|
settings := dbConfig.User+":"+dbConfig.Password+"@tcp("+dbConfig.Host+")/"+dbConfig.Name+"?charset=utf8&parseTime=True&loc=Local"
|
59
|
75
|
|
60
|
|
- var ttsRawList []TtsRaw
|
|
76
|
+ var phoneticList []Phonetic
|
61
|
77
|
|
62
|
78
|
db, err := gorm.Open("mysql", settings)
|
63
|
79
|
if err != nil {
|
|
@@ -66,80 +82,106 @@ func RunCommand() *cobra.Command {
|
66
|
82
|
|
67
|
83
|
db.Where("id <= ? and id >= ?", maxId, minId).Find(&ttsRawList)
|
68
|
84
|
|
69
|
|
- for _, ttsRaw := range ttsRawList {
|
70
|
|
-
|
71
|
|
- word := ttsRaw.Text
|
72
|
|
- c := colly.NewCollector(
|
73
|
|
- // Visit only domains: hackerspaces.org, wiki.hackerspaces.org
|
74
|
|
- colly.AllowedDomains("dict.cn", "m.dict.cn"),
|
75
|
|
- )
|
76
|
|
-
|
77
|
|
- // 例句
|
78
|
|
- c.OnHTML("div[class=\"layout sort\"]", func(e *colly.HTMLElement) {
|
79
|
|
- e.ForEach("ol", func(_ int, eol *colly.HTMLElement) {
|
80
|
|
- eol.ForEach("li", func(_ int, el *colly.HTMLElement) {
|
81
|
|
- //fmt.Println(el.DOM.Html())
|
82
|
|
- liText, _ := el.DOM.Html()
|
83
|
|
- liData := strings.Split(html.UnescapeString(liText), "<br/>")
|
84
|
|
- example := ReplaceTrim(liData[0])
|
85
|
|
- exampleParaphrase := ReplaceTrim(liData[1])
|
86
|
|
-
|
87
|
|
- examplePhrase := Phrase{Text:example, Paraphrase:exampleParaphrase, Type: 0, Word:word}
|
88
|
|
- fmt.Println(examplePhrase)
|
89
|
|
- db.Create(&examplePhrase)
|
90
|
|
- })
|
91
|
|
- })
|
92
|
|
- })
|
93
|
|
-
|
94
|
|
- // 词汇搭配, 短语
|
95
|
|
- c.OnHTML("div[class=\"layout coll\"]", func(e *colly.HTMLElement) {
|
96
|
|
-
|
97
|
|
- e.ForEach("li", func(_ int, el *colly.HTMLElement) {
|
98
|
|
- if el.ChildAttr("a", "href") != "" {
|
99
|
|
- phrase := ReplaceTrim(el.ChildText("a"))
|
100
|
|
- paraphrase := ReplaceTrim(el.Text)
|
101
|
|
- paraphrase = strings.Replace(paraphrase, phrase, "", 1)
|
102
|
|
-
|
103
|
|
- newPhrase := Phrase{Text:phrase, Paraphrase:paraphrase, Type: 1, Word:word}
|
104
|
|
- fmt.Println(newPhrase)
|
105
|
|
- db.Create(&newPhrase)
|
106
|
|
- }
|
107
|
|
- })
|
108
|
|
- })
|
109
|
|
-
|
110
|
|
- c.OnHTML("div[class=\"layout anno\"]", func(e *colly.HTMLElement) {
|
111
|
|
-
|
112
|
|
- e.ForEach("li", func(_ int, el *colly.HTMLElement) {
|
113
|
|
- if el.ChildAttr("a", "href") != "" {
|
114
|
|
- phrase := ReplaceTrim(el.ChildText("a"))
|
115
|
|
- paraphrase := ReplaceTrim(el.Text)
|
116
|
|
- paraphrase = strings.Replace(paraphrase, phrase, "", 1)
|
117
|
|
-
|
118
|
|
- newPhrase := Phrase{Text:phrase, Paraphrase:paraphrase, Type: 1, Word:word}
|
119
|
|
- fmt.Println(newPhrase)
|
120
|
|
- db.Create(&newPhrase)
|
121
|
|
- }
|
122
|
|
- })
|
123
|
|
- })
|
124
|
|
- c.OnRequest(func(r *colly.Request) {
|
125
|
|
- fmt.Println("Visiting", r.URL.String())
|
126
|
|
- })
|
127
|
|
-
|
128
|
|
- c.Visit("http://dict.cn/"+word)
|
|
85
|
+ for _, ttsRaw := range phoneticList {
|
|
86
|
+
|
|
87
|
+ word := ttsRaw.Word
|
129
|
88
|
|
|
89
|
+ Phonetic := transApi(word, "", "")
|
130
|
90
|
//if result == true {
|
131
|
91
|
ttsRaw.Status = 1
|
132
|
|
- //更新成功
|
133
|
|
- //} else {
|
134
|
|
- // ttsRaw.Status = -1
|
135
|
|
- // //更新错误状态和msg
|
136
|
|
- // ttsRaw.Remark = msg
|
137
|
|
- //}
|
|
92
|
+ ttsRaw.UkPhonetic = Phonetic["uk_phonetic"]
|
|
93
|
+ ttsRaw.UsPhonetic = Phonetic["us_phonetic"]
|
|
94
|
+
|
138
|
95
|
fmt.Println(ttsRaw)
|
139
|
96
|
db.Save(&ttsRaw)
|
140
|
97
|
}
|
141
|
98
|
|
142
|
99
|
db.Close()
|
|
100
|
+
|
|
101
|
+ //return
|
|
102
|
+ //var ttsRawList []TtsRaw
|
|
103
|
+ //
|
|
104
|
+ //db, err := gorm.Open("mysql", settings)
|
|
105
|
+ //if err != nil {
|
|
106
|
+ // panic("failed to connect database")
|
|
107
|
+ //}
|
|
108
|
+ //
|
|
109
|
+ //db.Where("id <= ? and id >= ?", maxId, minId).Find(&ttsRawList)
|
|
110
|
+ //
|
|
111
|
+ //for _, ttsRaw := range ttsRawList {
|
|
112
|
+ //
|
|
113
|
+ // word := ttsRaw.Text
|
|
114
|
+ // c := colly.NewCollector(
|
|
115
|
+ // // Visit only domains: hackerspaces.org, wiki.hackerspaces.org
|
|
116
|
+ // colly.AllowedDomains("dict.cn", "m.dict.cn"),
|
|
117
|
+ // )
|
|
118
|
+ //
|
|
119
|
+ // // 例句
|
|
120
|
+ // c.OnHTML("div[class=\"layout sort\"]", func(e *colly.HTMLElement) {
|
|
121
|
+ // e.ForEach("ol", func(_ int, eol *colly.HTMLElement) {
|
|
122
|
+ // eol.ForEach("li", func(_ int, el *colly.HTMLElement) {
|
|
123
|
+ // //fmt.Println(el.DOM.Html())
|
|
124
|
+ // liText, _ := el.DOM.Html()
|
|
125
|
+ // liData := strings.Split(html.UnescapeString(liText), "<br/>")
|
|
126
|
+ // example := ReplaceTrim(liData[0])
|
|
127
|
+ // exampleParaphrase := ReplaceTrim(liData[1])
|
|
128
|
+ //
|
|
129
|
+ // examplePhrase := Phrase{Text:example, Paraphrase:exampleParaphrase, Type: 0, Word:word}
|
|
130
|
+ // fmt.Println(examplePhrase)
|
|
131
|
+ // db.Create(&examplePhrase)
|
|
132
|
+ // })
|
|
133
|
+ // })
|
|
134
|
+ // })
|
|
135
|
+ //
|
|
136
|
+ // // 词汇搭配, 短语
|
|
137
|
+ // c.OnHTML("div[class=\"layout coll\"]", func(e *colly.HTMLElement) {
|
|
138
|
+ //
|
|
139
|
+ // e.ForEach("li", func(_ int, el *colly.HTMLElement) {
|
|
140
|
+ // if el.ChildAttr("a", "href") != "" {
|
|
141
|
+ // phrase := ReplaceTrim(el.ChildText("a"))
|
|
142
|
+ // paraphrase := ReplaceTrim(el.Text)
|
|
143
|
+ // paraphrase = strings.Replace(paraphrase, phrase, "", 1)
|
|
144
|
+ //
|
|
145
|
+ // newPhrase := Phrase{Text:phrase, Paraphrase:paraphrase, Type: 1, Word:word}
|
|
146
|
+ // fmt.Println(newPhrase)
|
|
147
|
+ // db.Create(&newPhrase)
|
|
148
|
+ // }
|
|
149
|
+ // })
|
|
150
|
+ // })
|
|
151
|
+ //
|
|
152
|
+ // c.OnHTML("div[class=\"layout anno\"]", func(e *colly.HTMLElement) {
|
|
153
|
+ //
|
|
154
|
+ // e.ForEach("li", func(_ int, el *colly.HTMLElement) {
|
|
155
|
+ // if el.ChildAttr("a", "href") != "" {
|
|
156
|
+ // phrase := ReplaceTrim(el.ChildText("a"))
|
|
157
|
+ // paraphrase := ReplaceTrim(el.Text)
|
|
158
|
+ // paraphrase = strings.Replace(paraphrase, phrase, "", 1)
|
|
159
|
+ //
|
|
160
|
+ // newPhrase := Phrase{Text:phrase, Paraphrase:paraphrase, Type: 1, Word:word}
|
|
161
|
+ // fmt.Println(newPhrase)
|
|
162
|
+ // db.Create(&newPhrase)
|
|
163
|
+ // }
|
|
164
|
+ // })
|
|
165
|
+ // })
|
|
166
|
+ // c.OnRequest(func(r *colly.Request) {
|
|
167
|
+ // fmt.Println("Visiting", r.URL.String())
|
|
168
|
+ // })
|
|
169
|
+ //
|
|
170
|
+ // c.Visit("http://dict.cn/"+word)
|
|
171
|
+ //
|
|
172
|
+ // //if result == true {
|
|
173
|
+ // ttsRaw.Status = 1
|
|
174
|
+ // //更新成功
|
|
175
|
+ // //} else {
|
|
176
|
+ // // ttsRaw.Status = -1
|
|
177
|
+ // // //更新错误状态和msg
|
|
178
|
+ // // ttsRaw.Remark = msg
|
|
179
|
+ // //}
|
|
180
|
+ // fmt.Println(ttsRaw)
|
|
181
|
+ // db.Save(&ttsRaw)
|
|
182
|
+ //}
|
|
183
|
+
|
|
184
|
+ db.Close()
|
143
|
185
|
},
|
144
|
186
|
}
|
145
|
187
|
return cmd
|
|
@@ -202,3 +244,75 @@ func ReplaceTrim(str string) string {
|
202
|
244
|
return strings.Trim(str, "")
|
203
|
245
|
}
|
204
|
246
|
|
|
247
|
+
|
|
248
|
+func transApi(text, from, to string) map[string]string {
|
|
249
|
+
|
|
250
|
+ apiUrl := "http://openapi.youdao.com/api"
|
|
251
|
+ appKey := "629a1435e6d2a894"
|
|
252
|
+ secKey := "DAgjDJfE0xPdZtMVhl1YUFUIrZc2DVHd"
|
|
253
|
+
|
|
254
|
+ basicMap := make (map[string]string)
|
|
255
|
+
|
|
256
|
+ if from == "" {
|
|
257
|
+ from = "auto"
|
|
258
|
+ }
|
|
259
|
+
|
|
260
|
+ if to == "" {
|
|
261
|
+ to = "auto"
|
|
262
|
+ }
|
|
263
|
+
|
|
264
|
+ rand.Seed(time.Now().Unix())
|
|
265
|
+ salt := strconv.Itoa(rand.Int())
|
|
266
|
+
|
|
267
|
+ sign := buildSign(appKey, text, salt, secKey)
|
|
268
|
+
|
|
269
|
+ request, err := http.NewRequest("GET", apiUrl, nil)
|
|
270
|
+ if err != nil {
|
|
271
|
+ log.Print(err)
|
|
272
|
+
|
|
273
|
+ return basicMap
|
|
274
|
+ }
|
|
275
|
+
|
|
276
|
+ query := request.URL.Query()
|
|
277
|
+ query.Add("q", text)
|
|
278
|
+ query.Add("from", from)
|
|
279
|
+ query.Add("to", to)
|
|
280
|
+ query.Add("appKey", appKey)
|
|
281
|
+ query.Add("salt", salt)
|
|
282
|
+ query.Add("sign", sign)
|
|
283
|
+
|
|
284
|
+ request.URL.RawQuery = query.Encode()
|
|
285
|
+
|
|
286
|
+ //fmt.Println(request.URL.String())
|
|
287
|
+
|
|
288
|
+ var resp *http.Response
|
|
289
|
+ resp, err = http.DefaultClient.Do(request)
|
|
290
|
+ if err != nil {
|
|
291
|
+ log.Print(err)
|
|
292
|
+
|
|
293
|
+ return basicMap
|
|
294
|
+ }
|
|
295
|
+
|
|
296
|
+ defer resp.Body.Close()
|
|
297
|
+
|
|
298
|
+ body, _ := ioutil.ReadAll(resp.Body)
|
|
299
|
+
|
|
300
|
+ result := map[string]interface{}{}
|
|
301
|
+ json.Unmarshal(body, &result)
|
|
302
|
+
|
|
303
|
+ basic := result["basic"].(map[string]string)
|
|
304
|
+ //fmt.Println(basic["phonetic"],basic["uk-phonetic"],basic["us-phonetic"])
|
|
305
|
+
|
|
306
|
+ basicMap["uk-phonetic"] = basic["uk-phonetic"]
|
|
307
|
+ basicMap["us-phonetic"] = basic["us-phonetic"]
|
|
308
|
+
|
|
309
|
+ return basicMap
|
|
310
|
+}
|
|
311
|
+
|
|
312
|
// buildSign returns the lowercase hexadecimal MD5 digest of appKey, text,
// salt and secKey concatenated in that order.
func buildSign(appKey, text, salt, secKey string) string {
	digest := md5.New()
	// Feeding the parts one after another hashes the same byte stream as
	// hashing their concatenation; hash.Hash.Write never returns an error.
	for _, part := range []string{appKey, text, salt, secKey} {
		digest.Write([]byte(part))
	}
	return fmt.Sprintf("%x", digest.Sum(nil))
}
|