Spring AI ETL Token Text Splitter Transformer Customized

The Spring AI ETL Token Text Splitter Transformer is an implementation of TextSplitter that splits large text into small chunks of Documents based on the token count, it uses the CL100K_BASE encoding. The Spring AI ETL Token Text Splitter Transformer Customized constructor allows to customize some parameters based on requirements, such as chunkSize, minChunkSizeChars, minChunkLengthToEmbed, maxNumChunks, etc

package com.example.springai.controller;

import org.springframework.ai.chat.model.ChatModel;
import org.springframework.ai.document.Document;
import org.springframework.ai.reader.TextReader;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.Resource;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RestController;

import java.util.List;

@RestController
public class SpringAiController {
    private final Resource apples;
    @Autowired
    private ChatModel chatModel;

    public SpringAiController(@Value("classpath:apples.st") Resource apples) {
        this.apples = apples;
    }

    @GetMapping("/textDocument")
    List<Document> textDocument() {
        var textReader = new TextReader(apples);
        var documents = textReader.get();
        var splitter = new TokenTextSplitter(true);
        return splitCustomized(splitter.apply(documents));
    }

    public List<Document> splitCustomized(List<Document> documents) {
        TokenTextSplitter splitter = new TokenTextSplitter(100, 40, 10, 500, true);
        return splitter.apply(documents);
    }
}

package com.example.springai;

import org.springframework.ai.chat.model.ChatModel;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

@SpringBootApplication
public class SpringAiApplication {
    @Autowired
    private ChatModel chatModel;

    public static void main(String[] args) {
        SpringApplication.run(SpringAiApplication.class, args);
    }
}

create a file apples.st in src/main/resources directory

spring.application.name=SpringAi
spring.docker.compose.lifecycle-management=start-only
spring.threads.virtual.enabled=true
# The default Ollama Model in Spring Ai is mistral, but it can be changed by setting the below property. make sure to download the same model in entrypoint.sh file
#spring.ai.ollama.chat.options.model=llama3.1
# If running the Ollama Docker Instance separately, then set this property
spring.docker.compose.enabled=false

services:
  ollama-model:
    image: ollama/ollama:latest
    container_name: ollama_container
    ports:
      - 11434:11434/tcp
    healthcheck:
      test: ollama --version || exit 1
    command: serve
    volumes:
      - ./ollama/ollama:/root/.ollama
      - ./entrypoint.sh:/entrypoint.sh
    pull_policy: missing
    tty: true
    restart: no
    entrypoint: [ "/usr/bin/bash", "/entrypoint.sh" ]

  open-webui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: open_webui_container
    environment:
      WEBUI_AUTH: false
    ports:
      - "8081:8080"
    extra_hosts:
      - "host.docker.internal:host-gateway"
    volumes:
      - open-webui:/app/backend/data
    restart: no

volumes:
  open-webui:

#!/bin/bash
# Start Ollama in the background.
/bin/ollama serve &
# Record Process ID.
pid=$!
# Pause for Ollama to start.
sleep 5
# The default Ollama Model in Spring Ai is mistral, but it can be changed in the applications property file. Make sure to download the same Model here
echo "🔴 Retrieve LLAMA3 model..."
ollama pull mistral
echo "🟢 Done!"
# Wait for the Ollama process to finish.
wait $pid

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://maven.apache.org/POM/4.0.0"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>3.3.2</version>
        <relativePath/>
    </parent>
    <groupId>com.example.springai</groupId>
    <artifactId>etl_pipeline_token_text_splitter</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>ETL Pipeline Token Text Splitter</name>
    <description>Demo project for Spring Boot</description>

    <properties>
        <java.version>21</java.version>
        <spring-ai.version>1.0.0-SNAPSHOT</spring-ai.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.ai</groupId>
            <artifactId>spring-ai-ollama-spring-boot-starter</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-docker-compose</artifactId>
            <scope>runtime</scope>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>
    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>org.springframework.ai</groupId>
                <artifactId>spring-ai-bom</artifactId>
                <version>${spring-ai.version}</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
        </dependencies>
    </dependencyManagement>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
                <configuration>
                    <mainClass>com.example.springai.SpringAiApplication</mainClass>
                    <excludes>
                        <exclude>
                            <groupId>org.projectlombok</groupId>
                            <artifactId>lombok</artifactId>
                        </exclude>
                    </excludes>
                </configuration>
            </plugin>
        </plugins>
    </build>
    <repositories>
        <repository>
            <id>spring-milestones</id>
            <name>Spring Milestones</name>
            <url>https://repo.spring.io/milestone</url>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
        </repository>
        <repository>
            <id>spring-snapshots</id>
            <name>Spring Snapshots</name>
            <url>https://repo.spring.io/snapshot</url>
            <releases>
                <enabled>false</enabled>
            </releases>
        </repository>
    </repositories>
</project>

Run the curl to see the Spring AI ETL Document JSON Pointer Reader

http://localhost:8080/jsonDocument

Checkout Spring AI ETL Token Text Splitter Transformer Customized from GitHub

Spring AI ETL Token Text Splitter Transformer Customized

follow us on