@@ -129,8 +129,9 @@ func CrawlByChrome(urlstr string) (b []byte, err error) {
129129//内容类型,contType:0表示markdown,1表示html,2表示文本
130130//force:是否是强力采集
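131131//intelligence: extraction mode. 1 = smart extraction via html2article; 2 = extract the element matched by diySelecter (only when diySelecter is non-empty); any other value (or 2 with an empty diySelecter) extracts the body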
132- func CrawlHtml2Markdown (urlstr string , contType int , force , intelligence bool , headers ... map [string ]string ) (cont string , err error ) {
133- if force {
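132+ //diySelecter: custom selector passed to doc.Find; surrounding whitespace is trimmed, and it is only used when intelligence == 2 and the trimmed value is non-empty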
133+ func CrawlHtml2Markdown (urlstr string , contType int , force bool , intelligence int , diySelecter string , headers ... map [string ]string ) (cont string , err error ) {
134+ if force { //强力模式
134135 var b []byte
135136 b , err = CrawlByChrome (urlstr )
136137 cont = string (b )
@@ -184,8 +185,11 @@ func CrawlHtml2Markdown(urlstr string, contType int, force, intelligence bool, h
184185 })
185186 }
186187
188+ diySelecter = strings .TrimSpace (diySelecter )
189+
187190 cont , err = doc .Html ()
188- if intelligence {
191+
192+ if intelligence == 1 { //智能提取
189193 ext , err := html2article .NewFromHtml (cont )
190194 if err != nil {
191195 return cont , err
@@ -196,11 +200,24 @@ func CrawlHtml2Markdown(urlstr string, contType int, force, intelligence bool, h
196200 }
197201 switch contType {
198202 case 1 : //=>html
199- cont = article .Html + "<br/><br/><br/>原文: " + urlstr
203+ cont = article .Html + "\n 原文: \n > " + urlstr
200204 case 2 : //=>text
201- cont = article .Content + fmt .Sprintf ("\n \r \n \r 原文: %v" , urlstr )
205+ cont = article .Content + fmt .Sprintf ("\n 原文: \n > %v" , urlstr )
202206 default : //0 && other=>markdown
203- cont = html2md .Convert (article .Html ) + fmt .Sprintf ("\n \r \n \r 原文:[%v](%v)" , urlstr , urlstr )
207+ cont = html2md .Convert (article .Html ) + fmt .Sprintf ("\n \r \n \r 原文:\n > [%v](%v)" , urlstr , urlstr )
208+ }
209+ } else if intelligence == 2 && diySelecter != "" { //自定义提取
210+ if htmlstr , err := doc .Find (diySelecter ).Html (); err != nil {
211+ return "" , err
212+ } else {
213+ switch contType {
214+ case 1 : //=>html
215+ cont = htmlstr + "\n \n \n 原文:\n > " + urlstr
216+ case 2 : //=>text
217+ cont = doc .Find (diySelecter ).Text () + fmt .Sprintf ("\n \r \n \r 原文:\n > %v" , urlstr )
218+ default : //0 && other=>markdown
219+ cont = html2md .Convert (htmlstr ) + fmt .Sprintf ("\n \r \n \r 原文:\n > [%v](%v)" , urlstr , urlstr )
220+ }
204221 }
205222 } else {
206223 //移除body中的所有js标签
@@ -211,12 +228,12 @@ func CrawlHtml2Markdown(urlstr string, contType int, force, intelligence bool, h
211228 switch contType {
212229 case 1 : //=>html
213230 htmlstr , _ := doc .Find ("body" ).Html ()
214- cont = htmlstr + "<br/><br/><br/>原文: " + urlstr
231+ cont = htmlstr + "\n \n \n 原文: \n > " + urlstr
215232 case 2 : //=>text
216- cont = doc .Find ("body" ).Text () + fmt .Sprintf ("\n \r \n \r 原文:%v" , urlstr )
233+ cont = doc .Find ("body" ).Text () + fmt .Sprintf ("\n \r \n \r 原文:\n > %v" , urlstr )
217234 default : //0 && other=>markdown
218235 htmlstr , _ := doc .Find ("body" ).Html ()
219- cont = html2md .Convert (htmlstr ) + fmt .Sprintf ("\n \r \n \r 原文:[%v](%v)" , urlstr , urlstr )
236+ cont = html2md .Convert (htmlstr ) + fmt .Sprintf ("\n \r \n \r 原文:\n > [%v](%v)" , urlstr , urlstr )
220237 }
221238 }
222239
0 commit comments