采集功能,增加自定义提取内容 · JavaCodeMood/BookStack@7d01614 · GitHub
Skip to content

Commit 7d01614

Browse files
committed
采集功能,增加自定义提取内容
1 parent e4f09ed commit 7d01614

4 files changed

Lines changed: 51 additions & 15 deletions

File tree

controllers/BaseController.go

Lines changed: 4 additions & 3 deletions

static/css/bookstack.css

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1182,4 +1182,4 @@ li.L1, li.L3, li.L5, li.L7, li.L9{background-color: transparent;}
11821182
.bookstack-bars li{border-bottom: 1px solid #efefef;}
11831183
.bookstack-bars li:last-child{border-bottom:0px;}
11841184
.bookstack-bars li:hover{background-color: #EFEFEF}
1185-
.bookstack-bars li:hover a{color: #10af88}
1185+
.bookstack-bars li:hover a{color: #10af88}

utils/util.go

Lines changed: 26 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -129,8 +129,9 @@ func CrawlByChrome(urlstr string) (b []byte, err error) {
129129
//内容类型,contType:0表示markdown,1表示html,2表示文本
130130
//force:是否是强力采集
131131
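//intelligence: extraction mode; 1 = smart extraction via html2article, 2 = custom extraction using diySelecter (only when diySelecter is non-empty), any other value extracts body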
132-
func CrawlHtml2Markdown(urlstr string, contType int, force, intelligence bool, headers ...map[string]string) (cont string, err error) {
133-
if force {
132+
//diySelecter:自定义选择器
133+
func CrawlHtml2Markdown(urlstr string, contType int, force bool, intelligence int, diySelecter string, headers ...map[string]string) (cont string, err error) {
134+
if force { //强力模式
134135
var b []byte
135136
b, err = CrawlByChrome(urlstr)
136137
cont = string(b)
@@ -184,8 +185,11 @@ func CrawlHtml2Markdown(urlstr string, contType int, force, intelligence bool, h
184185
})
185186
}
186187

188+
diySelecter = strings.TrimSpace(diySelecter)
189+
187190
cont, err = doc.Html()
188-
if intelligence {
191+
192+
if intelligence == 1 { //智能提取
189193
ext, err := html2article.NewFromHtml(cont)
190194
if err != nil {
191195
return cont, err
@@ -196,11 +200,24 @@ func CrawlHtml2Markdown(urlstr string, contType int, force, intelligence bool, h
196200
}
197201
switch contType {
198202
case 1: //=>html
199-
cont = article.Html + "<br/><br/><br/>原文:" + urlstr
203+
cont = article.Html + "\n原文:\n> " + urlstr
200204
case 2: //=>text
201-
cont = article.Content + fmt.Sprintf("\n\r\n\r原文:%v", urlstr)
205+
cont = article.Content + fmt.Sprintf("\n原文:\n> %v", urlstr)
202206
default: //0 && other=>markdown
203-
cont = html2md.Convert(article.Html) + fmt.Sprintf("\n\r\n\r原文:[%v](%v)", urlstr, urlstr)
207+
cont = html2md.Convert(article.Html) + fmt.Sprintf("\n\r\n\r原文:\n> [%v](%v)", urlstr, urlstr)
208+
}
209+
} else if intelligence == 2 && diySelecter != "" { //自定义提取
210+
if htmlstr, err := doc.Find(diySelecter).Html(); err != nil {
211+
return "", err
212+
} else {
213+
switch contType {
214+
case 1: //=>html
215+
cont = htmlstr + "\n\n\n原文:\n> " + urlstr
216+
case 2: //=>text
217+
cont = doc.Find(diySelecter).Text() + fmt.Sprintf("\n\r\n\r原文:\n> %v", urlstr)
218+
default: //0 && other=>markdown
219+
cont = html2md.Convert(htmlstr) + fmt.Sprintf("\n\r\n\r原文:\n> [%v](%v)", urlstr, urlstr)
220+
}
204221
}
205222
} else {
206223
//移除body中的所有js标签
@@ -211,12 +228,12 @@ func CrawlHtml2Markdown(urlstr string, contType int, force, intelligence bool, h
211228
switch contType {
212229
case 1: //=>html
213230
htmlstr, _ := doc.Find("body").Html()
214-
cont = htmlstr + "<br/><br/><br/>原文:" + urlstr
231+
cont = htmlstr + "\n\n\n原文:\n> " + urlstr
215232
case 2: //=>text
216-
cont = doc.Find("body").Text() + fmt.Sprintf("\n\r\n\r原文:%v", urlstr)
233+
cont = doc.Find("body").Text() + fmt.Sprintf("\n\r\n\r原文:\n> %v", urlstr)
217234
default: //0 && other=>markdown
218235
htmlstr, _ := doc.Find("body").Html()
219-
cont = html2md.Convert(htmlstr) + fmt.Sprintf("\n\r\n\r原文:[%v](%v)", urlstr, urlstr)
236+
cont = html2md.Convert(htmlstr) + fmt.Sprintf("\n\r\n\r原文:\n> [%v](%v)", urlstr, urlstr)
220237
}
221238
}
222239

views/document/markdown_edit_template.html

Lines changed: 20 additions & 2 deletions

0 commit comments

Comments
 (0)