chenyuanyang 5 years ago
parent
commit
98523b46a8
3 changed files with 117 additions and 58 deletions
  1. 85
    57
      cmd/job/job.go
  2. 0
    1
      main.go
  3. 32
    0
      service/store/mysql/tts_raw.go

+ 85
- 57
cmd/job/job.go View File

@@ -11,8 +11,8 @@ import (
11 11
 	"github.com/spf13/cobra"
12 12
 	"github.com/gocolly/colly"
13 13
 	"net/url"
14
-	"strconv"
15 14
 	"strings"
15
+	"html"
16 16
 )
17 17
 
18 18
 type TtsRaw struct {
@@ -32,6 +32,18 @@ type langType struct {
32 32
 	voiceName string;
33 33
 }
34 34
 
35
+
36
// Phrase is one scraped dictionary entry persisted to the
// lnk_corpus_phrase_spider table.
type Phrase struct {
	Text       string // the scraped example sentence or phrase
	Paraphrase string // the explanation scraped alongside Text
	Type       int64  // 0 for example sentences, 1 for collocations/phrases, as set by the scraper in this file
	Word       string // the headword whose page produced this entry
}

// TableName returns the name of the table that stores Phrase rows.
func (Phrase) TableName() string {
	const table = "lnk_corpus_phrase_spider"
	return table
}
46
+
35 47
 func RunCommand() *cobra.Command {
36 48
 	cmd := &cobra.Command{
37 49
 		Use:   "job",
@@ -52,23 +64,77 @@ func RunCommand() *cobra.Command {
52 64
 				panic("failed to connect database")
53 65
 			}
54 66
 
55
-			speed, _ := strconv.ParseFloat( "1.00", 64)
56
-			pitch, _ := strconv.ParseFloat( "0.00", 64)
57
-
58 67
 			db.Where("id <= ? and id >= ?", maxId, minId).Find(&ttsRawList)
59 68
 
60 69
 			for _, ttsRaw := range ttsRawList {
61 70
 
62
-				result, msg := SyncTtsOss(ttsRaw.Text, speed, pitch)
63
-
64
-				if result == true {
65
-					ttsRaw.Status = 1
71
+				word := ttsRaw.Text
72
+				c := colly.NewCollector(
73
+					// Visit only domains: dict.cn, m.dict.cn
74
+					colly.AllowedDomains("dict.cn", "m.dict.cn"),
75
+				)
76
+
77
+				// 例句
78
+				c.OnHTML("div[class=\"layout sort\"]", func(e *colly.HTMLElement) {
79
+					e.ForEach("ol", func(_ int, eol *colly.HTMLElement) {
80
+						eol.ForEach("li", func(_ int, el *colly.HTMLElement) {
81
+							//fmt.Println(el.DOM.Html())
82
+							liText, _ := el.DOM.Html()
83
+							liData := strings.Split(html.UnescapeString(liText), "<br/>")
84
+							example := ReplaceTrim(liData[0])
85
+							exampleParaphrase := ReplaceTrim(liData[1])
86
+
87
+							examplePhrase := Phrase{Text:example, Paraphrase:exampleParaphrase, Type: 0, Word:word}
88
+							fmt.Println(examplePhrase)
89
+							db.Create(&examplePhrase)
90
+						})
91
+					})
92
+				})
93
+
94
+				// 词汇搭配, 短语
95
+				c.OnHTML("div[class=\"layout coll\"]", func(e *colly.HTMLElement) {
96
+
97
+					e.ForEach("li", func(_ int, el *colly.HTMLElement) {
98
+						if el.ChildAttr("a", "href") != "" {
99
+							phrase := ReplaceTrim(el.ChildText("a"))
100
+							paraphrase := ReplaceTrim(el.Text)
101
+							paraphrase = strings.Replace(paraphrase, phrase, "", 1)
102
+
103
+							newPhrase := Phrase{Text:phrase, Paraphrase:paraphrase, Type: 1, Word:word}
104
+							fmt.Println(newPhrase)
105
+							db.Create(&newPhrase)
106
+						}
107
+					})
108
+				})
109
+
110
+				c.OnHTML("div[class=\"layout anno\"]", func(e *colly.HTMLElement) {
111
+
112
+					e.ForEach("li", func(_ int, el *colly.HTMLElement) {
113
+						if el.ChildAttr("a", "href") != "" {
114
+							phrase := ReplaceTrim(el.ChildText("a"))
115
+							paraphrase := ReplaceTrim(el.Text)
116
+							paraphrase = strings.Replace(paraphrase, phrase, "", 1)
117
+
118
+							newPhrase := Phrase{Text:phrase, Paraphrase:paraphrase, Type: 1, Word:word}
119
+							fmt.Println(newPhrase)
120
+							db.Create(&newPhrase)
121
+						}
122
+					})
123
+				})
124
+				c.OnRequest(func(r *colly.Request) {
125
+					fmt.Println("Visiting", r.URL.String())
126
+				})
127
+
128
+				c.Visit("http://dict.cn/"+word)
129
+
130
+				//if result == true {
131
+				ttsRaw.Status = 1
66 132
 					//更新成功
67
-				} else {
68
-					ttsRaw.Status = -1
69
-					//更新错误状态和msg
70
-					ttsRaw.Remark = msg
71
-				}
133
+				//} else {
134
+				//	ttsRaw.Status = -1
135
+				//	//更新错误状态和msg
136
+				//	ttsRaw.Remark = msg
137
+				//}
72 138
 				fmt.Println(ttsRaw)
73 139
 				db.Save(&ttsRaw)
74 140
 			}
@@ -128,49 +194,11 @@ func SyncTtsOss(text string, speed float64, pitch float64) (bool, string) {
128 194
 	return true, ""
129 195
 }
130 196
 
131
-func test() {
132
-
133
-	// Instantiate default collector
134
-	c := colly.NewCollector(
135
-		// Visit only domains: hackerspaces.org, wiki.hackerspaces.org
136
-		colly.AllowedDomains("dict.cn", "m.dict.cn"),
137
-	)
138
-
139
-	// On every a element which has href attribute call callback
140
-	c.OnHTML("div[class=sent]", func(e *colly.HTMLElement) {
141
-		//link := e.Attr("href")
142
-		// Print link
143
-		fmt.Printf("Link found: %q -> %s\n", e.Text)
144
-		// Visit link found on page
145
-		// Only those links are visited which are in AllowedDomains
146
-		//c.Visit(e.Request.AbsoluteURL(link))
147
-	})
148
-
149
-	/**
150
-	1. div[class=sent]
151
-	2. 例句 div[class=layout sort]
152
-	3. 词法用法 div[class=section learn]
153
-	4.
154
-	 */
155
-
156
-	// Before making a request print "Visiting ..."
157
-	c.OnRequest(func(r *colly.Request) {
158
-		fmt.Println("Visiting", r.URL.String())
159
-	})
160
-
161
-	// Start scraping on https://hackerspaces.org
162
-	c.Visit("http://dict.cn/about")
163
-}
197
// ReplaceTrim removes every newline and tab from str and then strips
// leading and trailing white space from the result.
//
// Interior spaces are preserved. The previous implementation finished with
// strings.Trim(str, ""), whose empty cutset trims nothing, so surrounding
// spaces were never removed.
func ReplaceTrim(str string) string {
	// Drop both control characters in a single pass over the string.
	cleaned := strings.NewReplacer("\n", "", "\t", "").Replace(str)

	return strings.TrimSpace(cleaned)
}
171 204
 
172
-			test()
173
-		},
174
-	}
175
-	return cmd
176
-}

+ 0
- 1
main.go View File

@@ -29,7 +29,6 @@ func main() {
29 29
 	rootCmd.AddCommand(http.RunCommand())
30 30
 	rootCmd.AddCommand(version.RunCommand(apiVersion, gitCommit, built))
31 31
 	rootCmd.AddCommand(job.RunCommand())
32
-	rootCmd.AddCommand(job.newTestCmd())
33 32
 
34 33
 	if err := rootCmd.Execute(); err != nil {
35 34
 		panic(err)

+ 32
- 0
service/store/mysql/tts_raw.go View File

@@ -0,0 +1,32 @@
1
+package mysql
2
+
3
+import (
4
+	"fmt"
5
+)
6
+
7
// ttsRawCollection is the name of the collection that GetTtsRawList reads.
const ttsRawCollection = "lnk_corpus_tts_raw"

// TtsRaw mirrors one row of the lnk_corpus_tts_raw collection; the db tags
// give the column each field maps to.
type TtsRaw struct {
	ID      int64  `db:"id"`
	Text    string `db:"text"`     // the text to be translated
	UniqKey string `db:"uniq_key"` // unique key value
	Status  int64  `db:"status"`
	Remark  string `db:"remark"`
}
15
+
16
+func GetTtsRawList() ([]TtsRaw, error)  {
17
+	var ttsRawList []TtsRaw
18
+
19
+	rows := db_session.Collection(ttsRawCollection).Find()
20
+	//if err != nil {
21
+	//
22
+	//	fmt.Println(err)
23
+	//	return ttsRawList, err
24
+	//}
25
+
26
+	err := rows.All(&ttsRawList)
27
+	if err != nil {
28
+	}
29
+	fmt.Println(ttsRawList) // SELECT id, name FROM accounts
30
+
31
+	return ttsRawList, err
32
+}