Spring AI ETL PDF Paragraph Document Reader

The PDF or Portable Document Format provides an easy way to transfer text documents, To make PDF navigation easy, it is good practice to add a Content Catalog or Table of Contents of the PDF in the initial pages. The Spring AI ETL PDF Paragraph Document Reader uses the information provided in the PDF catalog to split the long PDF into small chunks of text paragraphs and output them as a single Document per paragraph.

Not all PDF documents provide the PDF catalog.

package com.example.springai.controller;

import org.springframework.ai.document.Document;
import org.springframework.ai.reader.ExtractedTextFormatter;
import org.springframework.ai.reader.pdf.ParagraphPdfDocumentReader;
import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.Resource;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RestController;

import java.util.List;

@RestController
public class SpringAiController {
    private final Resource resource;

    public SpringAiController(@Value("classpath:spring-boot-reference.pdf") Resource resource) {
        this.resource = resource;
    }

    @GetMapping("/pdfDocument")
    List<Document> pdfDocument() {
        ParagraphPdfDocumentReader pdfReader = new ParagraphPdfDocumentReader(resource, PdfDocumentReaderConfig.builder()
                .withPageTopMargin(0)
                .withPageExtractedTextFormatter(ExtractedTextFormatter.builder()
                        .withNumberOfTopTextLinesToDelete(0)
                        .build())
                .withPagesPerDocument(1)
                .build());
        return pdfReader.read();
    }
}
package com.example.springai;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

@SpringBootApplication
public class SpringAiApplication {
    public static void main(String[] args) {
        SpringApplication.run(SpringAiApplication.class, args);
    }
}
spring.application.name=SpringAi
spring.docker.compose.lifecycle-management=start-only
spring.threads.virtual.enabled=true 
# The default Ollama Model in Spring Ai is mistral, but it can be changed by setting the below property. make sure to download the same model in entrypoint.sh file
#spring.ai.ollama.chat.options.model=llama3.1
# If running the Ollama Docker Instance separately, then set this property
spring.docker.compose.enabled=false
services:
  ollama-model:
    image: ollama/ollama:latest
    container_name: ollama_container
    ports:
      - 11434:11434/tcp
    healthcheck:
      test: ollama --version || exit 1
    command: serve
    volumes:
      - ./ollama/ollama:/root/.ollama
      - ./entrypoint.sh:/entrypoint.sh
    pull_policy: missing
    tty: true
    restart: no
    entrypoint: [ "/usr/bin/bash", "/entrypoint.sh" ]

  open-webui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: open_webui_container
    environment:
      WEBUI_AUTH: false
    ports:
      - "8081:8080"
    extra_hosts:
      - "host.docker.internal:host-gateway"
    volumes:
      - open-webui:/app/backend/data
    restart: no

volumes:
  open-webui:
#!/bin/bash
# Start Ollama in the background.
/bin/ollama serve &
# Record Process ID.
pid=$!
# Pause for Ollama to start.
sleep 5
# The default Ollama Model in Spring Ai is mistral, but it can be changed in the applications property file. Make sure to download the same Model here
echo "🔴 Retrieve LLAMA3 model..."
ollama pull mistral
echo "🟢 Done!"
# Wait for the Ollama process to finish.
wait $pid
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://maven.apache.org/POM/4.0.0"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>3.3.2</version>
        <relativePath/>
    </parent>
    <groupId>com.example.springai</groupId>
    <artifactId>etl-pipeline-text-pdf-paragraph</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>ETL Pipeline Text PDF Paragraph</name>
    <description>Demo project for Spring Boot</description>

    <properties>
        <java.version>21</java.version>
        <spring-ai.version>1.0.0-SNAPSHOT</spring-ai.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.ai</groupId>
            <artifactId>spring-ai-ollama-spring-boot-starter</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-docker-compose</artifactId>
            <scope>runtime</scope>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.ai</groupId>
            <artifactId>spring-ai-pdf-document-reader</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>
    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>org.springframework.ai</groupId>
                <artifactId>spring-ai-bom</artifactId>
                <version>${spring-ai.version}</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
        </dependencies>
    </dependencyManagement>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
                <configuration>
                    <mainClass>com.example.springai.SpringAiApplication</mainClass>
                    <excludes>
                        <exclude>
                            <groupId>org.projectlombok</groupId>
                            <artifactId>lombok</artifactId>
                        </exclude>
                    </excludes>
                </configuration>
            </plugin>
        </plugins>
    </build>
    <repositories>
        <repository>
            <id>spring-milestones</id>
            <name>Spring Milestones</name>
            <url>https://repo.spring.io/milestone</url>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
        </repository>
        <repository>
            <id>spring-snapshots</id>
            <name>Spring Snapshots</name>
            <url>https://repo.spring.io/snapshot</url>
            <releases>
                <enabled>false</enabled>
            </releases>
        </repository>
    </repositories>
</project>

Run the curl to see the Spring AI ETL PDF Paragraph Document Reader

http://localhost:8080/pdfDocument

follow us on