I am trying to read a .pdf file using the Python docbuilder's OpenFile function:
import os
import sys
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
# Make project-local code importable when this script is run directly.
# Each directory is pushed to the front of sys.path in turn, so the last
# entry in the tuple below ends up with the highest import priority.
current_dir = os.getcwd()
src_dir = os.path.join(current_dir, "src")
# The builder directory is added so the local docbuilder.py can be found.
builder_dir = os.path.join(current_dir, "builder")
for _import_root in (current_dir, src_dir, builder_dir):
    # Skip directories that are already present to avoid duplicate entries.
    if _import_root not in sys.path:
        sys.path.insert(0, _import_root)
import logging
from urllib.parse import urlparse
from src.services.office.builder import DocumentBuilder
# Configure logging to see what's happening: basicConfig sets up the root
# logger at DEBUG level.
logging.basicConfig(level=logging.DEBUG)
# Module-level logger; test_document_loading uses it to log exceptions.
logger = logging.getLogger(__name__)
def test_document_loading():
    """Smoke-test loading a remote PDF through DocumentBuilder.

    Creates a DocumentBuilder for a hard-coded blob URL, loads the document,
    builds the normalised index, and prints the raw and normalised text
    lengths together with the first 500 characters of the raw text.

    An empty raw text is reported as a failure instead of a success: the
    load step can print an error such as "open file error (88)" without
    raising, in which case the index comes back with zero-length text.

    Any exception raised along the way is logged with its traceback and is
    not re-raised. The function returns nothing.
    """
    blob_url = "https://example_domain.pdf"
    print(f"\n--- Testing DocumentBuilder with URL: {blob_url} ---")
    try:
        # 1. Initialize builder
        builder = DocumentBuilder(blob_url)

        # 2. Load document (PDF conversion is disabled in DocumentBuilder)
        print("Loading document...")
        builder.load_document_into_builder()
        print("Document loaded successfully into CDocBuilder.")

        # 3. Build normalized index (this will trigger GetText and save debug files)
        print("Building normalized index...")
        index = builder.build_normalised_index()

        print("\n--- Extraction Results ---")
        print(f"Raw text length: {len(index.raw_text)}")
        print(f"Normalized text length: {len(index.normalised)}")

        # Print a sample of the raw text to see separators
        print("\n--- Raw Text Sample (first 500 chars) ---")
        print(repr(index.raw_text[:500]))

        # Zero-length text means the document was not actually read, even
        # though no exception was raised; do not claim success in that case.
        if not index.raw_text:
            print("\n--- Failure ---")
            print("No text was extracted; the document was probably not opened.")
        else:
            print("\n--- Success ---")
        print("Check src/services/office/debug_outputs/ for the extracted files.")
    except Exception:
        print("\n--- Error ---")
        # logger.exception attaches the active traceback to the log record.
        logger.exception("DocumentBuilder test failed")


if __name__ == "__main__":
    test_document_loading()
When we run the script we receive an error, yet the script still reports success with zero-length text:
PYTHONPATH="$PWD/builder" doppler run -p bidscript-python -c dev_personal -- uv run test_pdf_loading.py
--- Testing DocumentBuilder with URL: https://example_domain/tender-docs/a79376e7-6c58-45a6-9cca-596b82c824fa.pdf ---
Loading document...
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): example_domain:443
DEBUG:urllib3.connectionpool:https://example_domain:443 "GET /tender-docs/a79376e7-6c58-45a6-9cca-596b82c824fa.pdf HTTP/1.1" 200 578814
DEBUG:src.services.office.builder:Saved loaded file for debug to: ./src/services/office/debug_outputs/loaded_20260208_092711_a79376e7-6c58-45a6-9cca-596b82c824fa.pdf
error: : open file error (88)
--- Extraction Results ---
Raw text length: 0
Normalized text length: 0
--- Raw Text Sample (first 500 chars) ---
''
--- Success ---
Check src/services/office/debug_outputs/ for the extracted files.