|
@@ -11,8 +11,8 @@ import (
|
11
|
11
|
"github.com/spf13/cobra"
|
12
|
12
|
"github.com/gocolly/colly"
|
13
|
13
|
"net/url"
|
14
|
|
- "strconv"
|
15
|
14
|
"strings"
|
|
15
|
+ "html"
|
16
|
16
|
)
|
17
|
17
|
|
18
|
18
|
type TtsRaw struct {
|
|
@@ -32,6 +32,18 @@ type langType struct {
|
32
|
32
|
voiceName string;
|
33
|
33
|
}
|
34
|
34
|
|
|
35
|
+
|
|
36
|
// Phrase is one scraped entry stored for a looked-up word: either an example
// sentence or a collocation/phrase, together with its paraphrase.
type Phrase struct {
	// Text is the example sentence or phrase itself; Paraphrase is the
	// accompanying explanation text scraped next to it.
	Text, Paraphrase string
	// Type distinguishes the kind of entry. The scraper in this file writes
	// 0 for example sentences and 1 for collocations/phrases.
	Type int64
	// Word is the headword whose dictionary page produced this entry.
	Word string
}
|
|
42
|
+
|
|
43
|
+func (Phrase) TableName() string {
|
|
44
|
+ return "lnk_corpus_phrase_spider"
|
|
45
|
+}
|
|
46
|
+
|
35
|
47
|
func RunCommand() *cobra.Command {
|
36
|
48
|
cmd := &cobra.Command{
|
37
|
49
|
Use: "job",
|
|
@@ -52,23 +64,77 @@ func RunCommand() *cobra.Command {
|
52
|
64
|
panic("failed to connect database")
|
53
|
65
|
}
|
54
|
66
|
|
55
|
|
- speed, _ := strconv.ParseFloat( "1.00", 64)
|
56
|
|
- pitch, _ := strconv.ParseFloat( "0.00", 64)
|
57
|
|
-
|
58
|
67
|
db.Where("id <= ? and id >= ?", maxId, minId).Find(&ttsRawList)
|
59
|
68
|
|
60
|
69
|
for _, ttsRaw := range ttsRawList {
|
61
|
70
|
|
62
|
|
- result, msg := SyncTtsOss(ttsRaw.Text, speed, pitch)
|
63
|
|
-
|
64
|
|
- if result == true {
|
65
|
|
- ttsRaw.Status = 1
|
|
71
|
+ word := ttsRaw.Text
|
|
72
|
+ c := colly.NewCollector(
|
|
73
|
+ // Visit only domains: hackerspaces.org, wiki.hackerspaces.org
|
|
74
|
+ colly.AllowedDomains("dict.cn", "m.dict.cn"),
|
|
75
|
+ )
|
|
76
|
+
|
|
77
|
+ // 例句
|
|
78
|
+ c.OnHTML("div[class=\"layout sort\"]", func(e *colly.HTMLElement) {
|
|
79
|
+ e.ForEach("ol", func(_ int, eol *colly.HTMLElement) {
|
|
80
|
+ eol.ForEach("li", func(_ int, el *colly.HTMLElement) {
|
|
81
|
+ //fmt.Println(el.DOM.Html())
|
|
82
|
+ liText, _ := el.DOM.Html()
|
|
83
|
+ liData := strings.Split(html.UnescapeString(liText), "<br/>")
|
|
84
|
+ example := ReplaceTrim(liData[0])
|
|
85
|
+ exampleParaphrase := ReplaceTrim(liData[1])
|
|
86
|
+
|
|
87
|
+ examplePhrase := Phrase{Text:example, Paraphrase:exampleParaphrase, Type: 0, Word:word}
|
|
88
|
+ fmt.Println(examplePhrase)
|
|
89
|
+ db.Create(&examplePhrase)
|
|
90
|
+ })
|
|
91
|
+ })
|
|
92
|
+ })
|
|
93
|
+
|
|
94
|
+ // 词汇搭配, 短语
|
|
95
|
+ c.OnHTML("div[class=\"layout coll\"]", func(e *colly.HTMLElement) {
|
|
96
|
+
|
|
97
|
+ e.ForEach("li", func(_ int, el *colly.HTMLElement) {
|
|
98
|
+ if el.ChildAttr("a", "href") != "" {
|
|
99
|
+ phrase := ReplaceTrim(el.ChildText("a"))
|
|
100
|
+ paraphrase := ReplaceTrim(el.Text)
|
|
101
|
+ paraphrase = strings.Replace(paraphrase, phrase, "", 1)
|
|
102
|
+
|
|
103
|
+ newPhrase := Phrase{Text:phrase, Paraphrase:paraphrase, Type: 1, Word:word}
|
|
104
|
+ fmt.Println(newPhrase)
|
|
105
|
+ db.Create(&newPhrase)
|
|
106
|
+ }
|
|
107
|
+ })
|
|
108
|
+ })
|
|
109
|
+
|
|
110
|
+ c.OnHTML("div[class=\"layout anno\"]", func(e *colly.HTMLElement) {
|
|
111
|
+
|
|
112
|
+ e.ForEach("li", func(_ int, el *colly.HTMLElement) {
|
|
113
|
+ if el.ChildAttr("a", "href") != "" {
|
|
114
|
+ phrase := ReplaceTrim(el.ChildText("a"))
|
|
115
|
+ paraphrase := ReplaceTrim(el.Text)
|
|
116
|
+ paraphrase = strings.Replace(paraphrase, phrase, "", 1)
|
|
117
|
+
|
|
118
|
+ newPhrase := Phrase{Text:phrase, Paraphrase:paraphrase, Type: 1, Word:word}
|
|
119
|
+ fmt.Println(newPhrase)
|
|
120
|
+ db.Create(&newPhrase)
|
|
121
|
+ }
|
|
122
|
+ })
|
|
123
|
+ })
|
|
124
|
+ c.OnRequest(func(r *colly.Request) {
|
|
125
|
+ fmt.Println("Visiting", r.URL.String())
|
|
126
|
+ })
|
|
127
|
+
|
|
128
|
+ c.Visit("http://dict.cn/"+word)
|
|
129
|
+
|
|
130
|
+ //if result == true {
|
|
131
|
+ ttsRaw.Status = 1
|
66
|
132
|
//更新成功
|
67
|
|
- } else {
|
68
|
|
- ttsRaw.Status = -1
|
69
|
|
- //更新错误状态和msg
|
70
|
|
- ttsRaw.Remark = msg
|
71
|
|
- }
|
|
133
|
+ //} else {
|
|
134
|
+ // ttsRaw.Status = -1
|
|
135
|
+ // //更新错误状态和msg
|
|
136
|
+ // ttsRaw.Remark = msg
|
|
137
|
+ //}
|
72
|
138
|
fmt.Println(ttsRaw)
|
73
|
139
|
db.Save(&ttsRaw)
|
74
|
140
|
}
|
|
@@ -128,49 +194,11 @@ func SyncTtsOss(text string, speed float64, pitch float64) (bool, string) {
|
128
|
194
|
return true, ""
|
129
|
195
|
}
|
130
|
196
|
|
131
|
|
-func test() {
|
132
|
|
-
|
133
|
|
- // Instantiate default collector
|
134
|
|
- c := colly.NewCollector(
|
135
|
|
- // Visit only domains: hackerspaces.org, wiki.hackerspaces.org
|
136
|
|
- colly.AllowedDomains("dict.cn", "m.dict.cn"),
|
137
|
|
- )
|
138
|
|
-
|
139
|
|
- // On every a element which has href attribute call callback
|
140
|
|
- c.OnHTML("div[class=sent]", func(e *colly.HTMLElement) {
|
141
|
|
- //link := e.Attr("href")
|
142
|
|
- // Print link
|
143
|
|
- fmt.Printf("Link found: %q -> %s\n", e.Text)
|
144
|
|
- // Visit link found on page
|
145
|
|
- // Only those links are visited which are in AllowedDomains
|
146
|
|
- //c.Visit(e.Request.AbsoluteURL(link))
|
147
|
|
- })
|
148
|
|
-
|
149
|
|
- /**
|
150
|
|
- 1. div[class=sent]
|
151
|
|
- 2. 例句 div[class=layout sort]
|
152
|
|
- 3. 词法用法 div[class=section learn]
|
153
|
|
- 4.
|
154
|
|
- */
|
155
|
|
-
|
156
|
|
- // Before making a request print "Visiting ..."
|
157
|
|
- c.OnRequest(func(r *colly.Request) {
|
158
|
|
- fmt.Println("Visiting", r.URL.String())
|
159
|
|
- })
|
160
|
|
-
|
161
|
|
- // Start scraping on https://hackerspaces.org
|
162
|
|
- c.Visit("http://dict.cn/about")
|
163
|
|
-}
|
|
197
|
// ReplaceTrim normalizes a scraped text fragment: it deletes every newline
// and tab character anywhere in str, then strips leading and trailing
// whitespace from what remains. Interior spaces are preserved.
//
// The trim step previously used strings.Trim(str, ""), whose empty cutset
// removes nothing, so surrounding spaces were left in place. It now uses
// strings.TrimSpace so the function actually trims, as its name promises.
func ReplaceTrim(str string) string {
	// Drop both control characters in a single pass over the input.
	str = strings.NewReplacer("\n", "", "\t", "").Replace(str)

	return strings.TrimSpace(str)
}
|
171
|
204
|
|
172
|
|
- test()
|
173
|
|
- },
|
174
|
|
- }
|
175
|
|
- return cmd
|
176
|
|
-}
|