Opening PDFs with docbuilder

I am trying to read a .pdf file using the Python docbuilder's OpenFile function:


import os
import sys
from dotenv import load_dotenv

# Load environment variables from the .env file before anything reads them.
load_dotenv()

# Make three locations importable: the working directory, its "src" folder,
# and its "builder" folder (which holds the local docbuilder.py). Each path is
# pushed to the front of sys.path only when it is not already listed, so a
# newly added path takes precedence over the ones added before it.
current_dir = os.getcwd()
src_dir = os.path.join(current_dir, "src")
builder_dir = os.path.join(current_dir, "builder")
for _import_root in (current_dir, src_dir, builder_dir):
    if _import_root not in sys.path:
        sys.path.insert(0, _import_root)

import logging
from urllib.parse import urlparse
from src.services.office.builder import DocumentBuilder

# Module-level logger for this script; root logging is configured at DEBUG
# so we can see what's happening while the document is loaded.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG)

def test_document_loading():
    """Load a remote document into DocumentBuilder and report the extracted text.

    Runs three steps against a hard-coded blob URL: construct the builder,
    load the document, and build the normalised index. The raw and normalised
    text lengths are printed, followed by a sample of the raw text.

    An empty extraction is reported as a failure instead of a success: a
    failed open inside the builder can surface only as a console message and
    empty text, without raising, so "no exception" is not proof that the
    document was read.

    Any exception raised by the steps is logged with its traceback and
    swallowed, so the function always returns ``None``.
    """
    blob_url = "https://example_domain.pdf"

    print(f"\n--- Testing DocumentBuilder with URL: {blob_url} ---")

    try:
        # 1. Initialize builder
        builder = DocumentBuilder(blob_url)

        # 2. Load document
        # NOTE(review): the original comment here said PDF conversion is
        # disabled in DocumentBuilder - confirm against the builder code.
        print("Loading document...")
        builder.load_document_into_builder()
        print("Document loaded successfully into CDocBuilder.")

        # 3. Build normalized index (this will trigger GetText and save debug files)
        print("Building normalized index...")
        index = builder.build_normalised_index()

        print("\n--- Extraction Results ---")
        print(f"Raw text length: {len(index.raw_text)}")
        print(f"Normalized text length: {len(index.normalised)}")

        # Zero extracted characters means the pipeline produced nothing
        # usable; say so explicitly rather than printing "Success".
        if not index.raw_text:
            print("\n--- Failure ---")
            print(
                "No text was extracted; the document was probably not opened "
                "(look for an 'open file error' message above)."
            )
            return

        # Print a sample of the raw text to see separators
        print("\n--- Raw Text Sample (first 500 chars) ---")
        print(repr(index.raw_text[:500]))

        print("\n--- Success ---")
        print("Check src/services/office/debug_outputs/ for the extracted files.")

    except Exception:
        # Top-level boundary of a diagnostic script: log the traceback and
        # return instead of propagating.
        print("\n--- Error ---")
        logger.exception("Document loading or text extraction failed")

# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    test_document_loading()

We receive an error:

PYTHONPATH="$PWD/builder" doppler run -p bidscript-python -c dev_personal -- uv run test_pdf_loading.py

--- Testing DocumentBuilder with URL: https://example_domain/tender-docs/a79376e7-6c58-45a6-9cca-596b82c824fa.pdf ---
Loading document...
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): example_domain:443
DEBUG:urllib3.connectionpool:https://example_domain:443 "GET /tender-docs/a79376e7-6c58-45a6-9cca-596b82c824fa.pdf HTTP/1.1" 200 578814
DEBUG:src.services.office.builder:Saved loaded file for debug to: ./src/services/office/debug_outputs/loaded_20260208_092711_a79376e7-6c58-45a6-9cca-596b82c824fa.pdf
error: : open file error (88)

--- Extraction Results ---
Raw text length: 0
Normalized text length: 0

--- Raw Text Sample (first 500 chars) ---
''

--- Success ---
Check src/services/office/debug_outputs/ for the extracted files.

Hello @Tyler

What is your overall goal for PDF files? Also, please let me know which version of Document Builder you are using.

In general, I do not see where OpenFile or CDocBuilder is used in the code you shared.

Hey, we have a text search working for docx files. The files are opened using OpenFile in this function:

    def load_document_into_builder(self):
        """Fetch the document at ``self.blob_url`` and load it for text search.

        The file extension is taken from the URL path. PDFs are first turned
        into DOCX bytes via ``self._convert_pdf_to_docx``; every other type is
        downloaded as-is. The bytes are written to a temporary file whose path
        is stored in ``self._source_path``.

        ``.xlsx``/``.xls`` files are read with pandas into ``self.xlsx_data``
        and ``self.builder`` is set to ``True`` as a "loaded" marker. All other
        types are opened with ``docbuilder.CDocBuilder.OpenFile`` and the
        builder instance is stored in ``self.builder``.

        Raises:
            RuntimeError: if the CDocBuilder cannot be created, or if
                ``OpenFile`` returns a non-zero error code.
            Exception: whatever ``requests`` raises for a failed download,
                including the error from ``raise_for_status()``.
        """
        import docbuilder
        import tempfile

        parsed = urlparse(self.blob_url)
        path = unquote(parsed.path or "")
        basename = os.path.basename(path)
        _, ext = os.path.splitext(basename)
        ext_lower = ext.lower()
        # URLs without an extension are treated as DOCX when naming the temp file.
        suffix = ext or ".docx"

        doc_type = ext_lower.lstrip(".")
        if doc_type in OFFICE_DOC_TYPES:
            self._doc_type = cast(OfficeDocType, doc_type)

        if ext_lower == ".pdf":
            # PDFs are converted up front; from here on the document is a DOCX.
            content = self._convert_pdf_to_docx("pdf", "docx", self.blob_url)
            suffix = ".docx"
            self._doc_type = "docx"
        else:
            r = requests.get(self.blob_url, timeout=60)
            r.raise_for_status()
            content = r.content

        # delete=False: the file must outlive this block so it can be reopened
        # by path below.
        # NOTE(review): nothing here removes the file; presumably cleanup
        # happens elsewhere via self._source_path - confirm.
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp_file:
            tmp_file.write(content)
            tmp_path = tmp_file.name
        self._source_path = tmp_path

        if ext_lower in [".xlsx", ".xls"]:
            self.xlsx_data = pd.read_excel(tmp_path, sheet_name=None, header=None)
            self.builder = True  # signal loaded
            self._doc_type = "xlsx"
            self._loaded_blob_url = self.blob_url
            self.logger.debug(
                "XLSX successfully loaded via pandas for URL: %s",
                self.blob_url[:80],
            )
            return

        builder = docbuilder.CDocBuilder()
        if not builder:
            raise RuntimeError(f"Failed to create CDocBuilder for {tmp_path}")

        params = ""
        if ext_lower == ".csv":
            # The original string began with a stray "<<", which made the XML
            # fragment malformed.
            # NOTE(review): 46 / 4 are presumably the UTF-8 encoding and comma
            # delimiter codes - confirm against the Document Builder option tables.
            params = "<m_nCsvTxtEncoding>46</m_nCsvTxtEncoding><m_nCsvDelimiter>4</m_nCsvDelimiter>"

        # OpenFile reports failure through its return code (0 means success)
        # rather than by raising. Ignoring it leaves an empty builder that
        # later yields empty text, e.g. after "open file error (88)".
        open_result = builder.OpenFile(tmp_path, params)
        if open_result:
            raise RuntimeError(
                f"CDocBuilder.OpenFile failed for {tmp_path} (error code {open_result})"
            )

        self.builder = builder
        self._loaded_blob_url = self.blob_url
        self.logger.debug(
            "Document successfully loaded into CDocBuilder for URL: %s",
            self.blob_url[:80],
        )
        # Drop any state derived from a previously loaded document.
        self.context = None
        self._normalised_index = None

We then produce a normalised index of the text, which removes separators and lets us search for chunks of text until we get a hit. Once a hit is found, we search raw_text from the editor to retrieve the ranges within the document, and expand the hit until we reach a score (from difflib's SequenceMatcher) greater than 0.98. This lets us search server-side (Python backend) across docx files using the docbuilder. We wanted to see whether this approach could be carried over to PDF, but I think the document models may be different. Even when we convert to docx using the conversion API, it does not model the document well enough: when we call .GetText(), almost 3/4 of the text data is lost. Instead, we use the pdf2docx library for PDFs, which retains all of the information when calling .GetText(), and the result is then used in the same pipeline. For xlsx we use pandas with openpyxl as the backend to identify the cells containing the text.

Unfortunately, there are no full-fledged Office API tools for regular PDF files. Document Builder cannot directly interact with a PDF document in the way it does with a DOCX.