1717import argparse
1818import subprocess
1919import tempfile
20+ import logging
2021from pathlib import Path
2122from typing import (
2223 Dict ,
@@ -43,6 +44,9 @@ class MineruParser:
4344
4445 __slots__ = ()
4546
47+ # Class-level logger
48+ logger = logging .getLogger (__name__ )
49+
4650 def __init__ (self ) -> None :
4751 """Initialize MineruParser"""
4852 pass  # no-op: the constructor performs no setup
@@ -116,13 +120,13 @@ def _run_mineru_command(
116120 encoding = "utf-8" ,
117121 errors = "ignore" ,
118122 )
119- print ("MinerU command executed successfully" )
123+ logging . info ("MinerU command executed successfully" )
120124 if result .stdout :
121- print (f"Output: { result .stdout } " )
125+ logging . debug (f"Output: { result .stdout } " )
122126 except subprocess .CalledProcessError as e :
123- print (f"Error running mineru command: { e } " )
127+ logging . error (f"Error running mineru command: { e } " )
124128 if e .stderr :
125- print (f"Error details: { e .stderr } " )
129+ logging . error (f"Error details: { e .stderr } " )
126130 raise
127131 except FileNotFoundError :
128132 raise RuntimeError (
@@ -147,12 +151,13 @@ def _read_output_files(
147151 # Look for the generated files
148152 md_file = output_dir / f"{ file_stem } .md"
149153 json_file = output_dir / f"{ file_stem } _content_list.json"
154+ images_base_dir = output_dir # Base directory for images
150155
151- # Check for files in subdirectory (MinerU 2.0 may create subdirectories)
152- subdir = output_dir / file_stem
153- if subdir . exists ():
154- md_file = subdir / method / f"{ file_stem } .md "
155- json_file = subdir / method / f" { file_stem } _content_list.json"
156+ file_stem_subdir = output_dir / file_stem
157+ if file_stem_subdir . exists ():
158+ md_file = file_stem_subdir / method / f" { file_stem } .md"
159+ json_file = file_stem_subdir / method / f"{ file_stem } _content_list.json "
160+ images_base_dir = file_stem_subdir / method
156161
157162 # Read markdown content
158163 md_content = ""
@@ -161,16 +166,38 @@ def _read_output_files(
161166 with open (md_file , "r" , encoding = "utf-8" ) as f :
162167 md_content = f .read ()
163168 except Exception as e :
164- print (f"Warning: Could not read markdown file { md_file } : { e } " )
169+ logging . warning (f"Could not read markdown file { md_file } : { e } " )
165170
166171 # Read JSON content list
167172 content_list = []
168173 if json_file .exists ():
169174 try :
170175 with open (json_file , "r" , encoding = "utf-8" ) as f :
171176 content_list = json .load (f )
177+
178+ # Always fix relative paths in content_list to absolute paths
179+ logging .info (
180+ f"Fixing image paths in { json_file } with base directory: { images_base_dir } "
181+ )
182+ for item in content_list :
183+ if isinstance (item , dict ):
184+ for field_name in [
185+ "img_path" ,
186+ "table_img_path" ,
187+ "equation_img_path" ,
188+ ]:
189+ if field_name in item and item [field_name ]:
190+ img_path = item [field_name ]
191+ absolute_img_path = (
192+ images_base_dir / img_path
193+ ).resolve ()
194+ item [field_name ] = str (absolute_img_path )
195+ logging .debug (
196+ f"Updated { field_name } : { img_path } -> { item [field_name ]} "
197+ )
198+
172199 except Exception as e :
173- print (f"Warning: Could not read JSON file { json_file } : { e } " )
200+ logging . warning (f"Could not read JSON file { json_file } : { e } " )
174201
175202 # # If standard files not found, look for any .md and .json files in the directory
176203 # if not md_content and not content_list:
@@ -254,7 +281,7 @@ def parse_pdf(
254281 return content_list , md_content
255282
256283 except Exception as e :
257- print (f"Error in parse_pdf: { str (e )} " )
284+ logging . error (f"Error in parse_pdf: { str (e )} " )
258285 raise
259286
260287 @staticmethod
@@ -312,7 +339,9 @@ def parse_image(
312339
313340 # If format is not natively supported by MinerU, convert it
314341 if ext not in mineru_supported_formats :
315- print (f"Converting { ext } image to PNG for MinerU compatibility..." )
342+ logging .info (
343+ f"Converting { ext } image to PNG for MinerU compatibility..."
344+ )
316345
317346 try :
318347 from PIL import Image
@@ -352,7 +381,7 @@ def parse_image(
352381
353382 # Save as PNG
354383 img .save (temp_converted_file , "PNG" , optimize = True )
355- print (
384+ logging . info (
356385 f"Successfully converted { image_path .name } to PNG ({ temp_converted_file .stat ().st_size / 1024 :.1f} KB)"
357386 )
358387
@@ -402,7 +431,7 @@ def parse_image(
402431 pass # Ignore cleanup errors
403432
404433 except Exception as e :
405- print (f"Error in parse_image: { str (e )} " )
434+ logging . error (f"Error in parse_image: { str (e )} " )
406435 raise
407436
408437 @staticmethod
@@ -460,7 +489,7 @@ def parse_office_doc(
460489 )
461490 libreoffice_available = True
462491 working_libreoffice_cmd = "libreoffice"
463- print (f"LibreOffice detected: { result .stdout .strip ()} " )
492+ logging . info (f"LibreOffice detected: { result .stdout .strip ()} " )
464493 except (
465494 subprocess .CalledProcessError ,
466495 FileNotFoundError ,
@@ -482,7 +511,7 @@ def parse_office_doc(
482511 )
483512 libreoffice_available = True
484513 working_libreoffice_cmd = cmd
485- print (
514+ logging . info (
486515 f"LibreOffice detected with command '{ cmd } ': { result .stdout .strip ()} "
487516 )
488517 break
@@ -510,7 +539,7 @@ def parse_office_doc(
510539 temp_path = Path (temp_dir )
511540
512541 # Convert to PDF using LibreOffice
513- print (f"Converting { doc_path .name } to PDF using LibreOffice..." )
542+ logging . info (f"Converting { doc_path .name } to PDF using LibreOffice..." )
514543
515544 # Use the working LibreOffice command first, then try alternatives if it fails
516545 commands_to_try = [working_libreoffice_cmd ]
@@ -543,16 +572,20 @@ def parse_office_doc(
543572
544573 if result .returncode == 0 :
545574 conversion_successful = True
546- print (f"Successfully converted { doc_path .name } to PDF" )
575+ logging .info (
576+ f"Successfully converted { doc_path .name } to PDF"
577+ )
547578 break
548579 else :
549- print (
580+ logging . warning (
550581 f"LibreOffice command '{ cmd } ' failed: { result .stderr } "
551582 )
552583 except subprocess .TimeoutExpired :
553- print (f"LibreOffice command '{ cmd } ' timed out" )
584+ logging . warning (f"LibreOffice command '{ cmd } ' timed out" )
554585 except Exception as e :
555- print (f"LibreOffice command '{ cmd } ' failed with exception: { e } " )
586+ logging .error (
587+ f"LibreOffice command '{ cmd } ' failed with exception: { e } "
588+ )
556589
557590 if not conversion_successful :
558591 raise RuntimeError (
@@ -569,7 +602,7 @@ def parse_office_doc(
569602 )
570603
571604 pdf_path = pdf_files [0 ]
572- print (
605+ logging . info (
573606 f"Generated PDF: { pdf_path .name } ({ pdf_path .stat ().st_size } bytes)"
574607 )
575608
@@ -586,7 +619,7 @@ def parse_office_doc(
586619 )
587620
588621 except Exception as e :
589- print (f"Error in parse_office_doc: { str (e )} " )
622+ logging . error (f"Error in parse_office_doc: { str (e )} " )
590623 raise
591624
592625 @staticmethod
@@ -630,7 +663,7 @@ def parse_text_file(
630663 try :
631664 with open (text_path , "r" , encoding = encoding ) as f :
632665 text_content = f .read ()
633- print (f"Successfully read file with { encoding } encoding" )
666+ logging . info (f"Successfully read file with { encoding } encoding" )
634667 break
635668 except UnicodeDecodeError :
636669 continue
@@ -645,7 +678,7 @@ def parse_text_file(
645678 pdf_path = temp_path / f"{ text_path .stem } .pdf"
646679
647680 # Convert text to PDF
648- print (f"Converting { text_path .name } to PDF..." )
681+ logging . info (f"Converting { text_path .name } to PDF..." )
649682
650683 try :
651684 from reportlab .lib .pagesizes import A4
@@ -960,15 +993,17 @@ def parse_text_file(
960993 )
961994 )
962995 story .append (Spacer (1 , 12 ))
963- print (f" 📷 Added image: { img_path .name } " )
996+ logging .info (
997+ f" 📷 Added image: { img_path .name } "
998+ )
964999 except Exception as e :
9651000 story .append (
9661001 Paragraph (
9671002 f"[Image loading failed: { alt_text } ]" ,
9681003 normal_style ,
9691004 )
9701005 )
971- print (
1006+ logging . warning (
9721007 f" ⚠️ Failed to load image { img_path } : { e } "
9731008 )
9741009 else :
@@ -978,7 +1013,9 @@ def parse_text_file(
9781013 normal_style ,
9791014 )
9801015 )
981- print (f" ⚠️ Image not found: { img_src } " )
1016+ logging .warning (
1017+ f" ⚠️ Image not found: { img_src } "
1018+ )
9821019
9831020 # Block quotes
9841021 elif line .startswith (">" ):
@@ -1057,7 +1094,7 @@ def parse_text_file(
10571094
10581095 else :
10591096 # Handle plain text files (.txt)
1060- print (
1097+ logging . info (
10611098 f"Processing plain text file with { len (text_content )} characters..."
10621099 )
10631100
@@ -1086,15 +1123,15 @@ def parse_text_file(
10861123 story .append (Paragraph (safe_line , normal_style ))
10871124 story .append (Spacer (1 , 3 ))
10881125
1089- print (f"Added { line_count } lines to PDF" )
1126+ logging . info (f"Added { line_count } lines to PDF" )
10901127
10911128 # If no content was added, add a placeholder
10921129 if not story :
10931130 story .append (Paragraph ("(Empty text file)" , normal_style ))
10941131
10951132 # Build PDF
10961133 doc .build (story )
1097- print (
1134+ logging . info (
10981135 f"Successfully converted { text_path .name } to PDF ({ pdf_path .stat ().st_size / 1024 :.1f} KB)"
10991136 )
11001137
@@ -1120,7 +1157,7 @@ def parse_text_file(
11201157 )
11211158
11221159 except Exception as e :
1123- print (f"Error in parse_text_file: { str (e )} " )
1160+ logging . error (f"Error in parse_text_file: { str (e )} " )
11241161 raise
11251162
11261163 @staticmethod
@@ -1158,7 +1195,7 @@ def parse_document(
11581195 elif ext in [".jpg" , ".jpeg" , ".png" , ".bmp" , ".tiff" , ".tif" , ".gif" , ".webp" ]:
11591196 return MineruParser .parse_image (file_path , output_dir , lang , ** kwargs )
11601197 elif ext in [".doc" , ".docx" , ".ppt" , ".pptx" , ".xls" , ".xlsx" ]:
1161- print (
1198+ logging . warning (
11621199 f"Warning: Office document detected ({ ext } ). "
11631200 f"MinerU 2.0 requires conversion to PDF first."
11641201 )
@@ -1167,7 +1204,7 @@ def parse_document(
11671204 return MineruParser .parse_text_file (file_path , output_dir , lang , ** kwargs )
11681205 else :
11691206 # For unsupported file types, try as PDF
1170- print (
1207+ logging . warning (
11711208 f"Warning: Unsupported file extension '{ ext } ', "
11721209 f"attempting to parse as PDF"
11731210 )
@@ -1190,10 +1227,10 @@ def check_installation() -> bool:
11901227 encoding = "utf-8" ,
11911228 errors = "ignore" ,
11921229 )
1193- print (f"MinerU version: { result .stdout .strip ()} " )
1230+ logging . debug (f"MinerU version: { result .stdout .strip ()} " )
11941231 return True
11951232 except (subprocess .CalledProcessError , FileNotFoundError ):
1196- print (
1233+ logging . debug (
11971234 "MinerU 2.0 is not properly installed. "
11981235 "Please install it using: pip install -U 'mineru[core]'"
11991236 )
0 commit comments