@@ -117,6 +117,9 @@ func RenderDocumentById(id int) {
117117
118118//使用chrome采集网页HTML
119119func CrawlByChrome (urlstr string ) (b []byte , err error ) {
120+ if strings .Contains (urlstr , "bookstack" ) {
121+ return
122+ }
120123 chrome := beego .AppConfig .DefaultString ("chrome" , "chromium-browser" )
121124 args := []string {"--headless" , "--disable-gpu" , "--dump-dom" , "--no-sandbox" , urlstr }
122125 cmd := exec .Command (chrome , args ... )
@@ -129,7 +132,10 @@ func CrawlByChrome(urlstr string) (b []byte, err error) {
129132//intelligence:是否是智能提取,智能提取,使用html2article,否则提取body
130133//diySelector:自定义选择器
131134//注意:由于参数问题,采集并下载图片的话,在headers中加上key为"project"的字段,值为文档项目的标识
132- func CrawlHtml2Markdown (urlstr string , contType int , force bool , intelligence int , diySelecter string , headers ... map [string ]string ) (cont string , err error ) {
135+ func CrawlHtml2Markdown (urlstr string , contType int , force bool , intelligence int , diySelector string , headers ... map [string ]string ) (cont string , err error ) {
136+ if strings .Contains (urlstr , "bookstack" ) {
137+ return
138+ }
133139 if force { //强力模式
134140 var b []byte
135141 b , err = CrawlByChrome (urlstr )
@@ -175,7 +181,6 @@ func CrawlHtml2Markdown(urlstr string, contType int, force bool, intelligence in
175181 }
176182 }
177183
178- //TODO:采集并下载图片
179184 ext := strings .ToLower (filepath .Ext (src ))
180185 if strings .HasPrefix (ext , ".jpeg" ) {
181186 ext = ".jpeg"
@@ -226,7 +231,7 @@ func CrawlHtml2Markdown(urlstr string, contType int, force bool, intelligence in
226231 })
227232 }
228233
229- diySelecter = strings .TrimSpace (diySelecter )
234+ diySelector = strings .TrimSpace (diySelector )
230235
231236 cont , err = doc .Html ()
232237
@@ -245,19 +250,19 @@ func CrawlHtml2Markdown(urlstr string, contType int, force bool, intelligence in
245250 case 2 : //=>text
246251 cont = article .Content + fmt .Sprintf ("\n 原文:\n > %v" , urlstr )
247252 default : //0 && other=>markdown
248- cont = html2md .Convert (article .Html ) + fmt .Sprintf ("\n \r \n \r 原文:\n > [%v](%v)" , urlstr , urlstr )
253+ cont = html2md .Convert (article .Html ) + fmt .Sprintf ("\n \r \n \r 原文:\n > %v" , urlstr )
249254 }
250- } else if intelligence == 2 && diySelecter != "" { //自定义提取
251- if htmlstr , err := doc .Find (diySelecter ).Html (); err != nil {
255+ } else if intelligence == 2 && diySelector != "" { //自定义提取
256+ if htmlstr , err := doc .Find (diySelector ).Html (); err != nil {
252257 return "" , err
253258 } else {
254259 switch contType {
255260 case 1 : //=>html
256261 cont = htmlstr + "\n \r \n \r > 原文: " + urlstr
257262 case 2 : //=>text
258- cont = doc .Find (diySelecter ).Text () + fmt .Sprintf ("\n \r \n \r > 原文: %v" , urlstr )
263+ cont = doc .Find (diySelector ).Text () + fmt .Sprintf ("\n \r \n \r > 原文: %v" , urlstr )
259264 default : //0 && other=>markdown
260- cont = html2md .Convert (htmlstr ) + fmt .Sprintf ("\n \r \n \r > 原文: [%v](%v)" , urlstr , urlstr )
265+ cont = html2md .Convert (htmlstr ) + fmt .Sprintf ("\n \r \n \r > 原文: %v" , urlstr )
261266 }
262267 }
263268 } else { //全文
@@ -274,7 +279,7 @@ func CrawlHtml2Markdown(urlstr string, contType int, force bool, intelligence in
274279 cont = doc .Find ("body" ).Text () + fmt .Sprintf ("\n \r \n \r > 原文: %v" , urlstr )
275280 default : //0 && other=>markdown
276281 htmlstr , _ := doc .Find ("body" ).Html ()
277- cont = html2md .Convert (htmlstr ) + fmt .Sprintf ("\n \r \n \r > 原文: [%v](%v)" , urlstr , urlstr )
282+ cont = html2md .Convert (htmlstr ) + fmt .Sprintf ("\n \r \n \r > 原文: %v" , urlstr )
278283 }
279284 }
280285
0 commit comments