openstack-lightspeed · omkarjoshi0304 · Jan 14, 2026 · Feb 17, 2026 · Feb 17, 2026 · Akrog
@@ -23,7 +23,9 @@ COPY ./scripts ./scripts
 # python-devel and pcre-devel are needed for python-openstackclient
+# pandoc converts the API-Ref HTML output to plain text (it replaces html2text)
 RUN if [ "$BUILD_UPSTREAM_DOCS" = "true" ]; then \
         dnf install -y graphviz python-devel pcre-devel pip && \
-        pip install tox html2text && \
+        bash -c 'set -o pipefail; curl -fL https://github.com/jgm/pandoc/releases/download/3.1.11.1/pandoc-3.1.11.1-linux-amd64.tar.gz | tar -zx --strip-components=1 -C /usr/local/' && \
+        pip install tox && \
         ./scripts/get_openstack_plaintext_docs.sh; \
     fi
 

@@ -1,4 +1,3 @@
 lightspeed-rag-content @ git+https://github.com/lightspeed-core/rag-content@main
 packaging
 lxml
-html2text
@@ -32,6 +32,28 @@ def clean_url(unclean_url):
 class OpenStackDocsMetadataProcessor(MetadataProcessor):
     """Metadata processor for OpenStack documentation."""
 
+    API_REF_SERVICE_MAPPING = {
+        "cinder": "block-storage",
+        "nova": "compute",
+        "trove": "database",
+        "designate": "dns",
+        "keystone": "identity",
+        "glance": "image",
+        "watcher": "resource-optimization",
+        "masakari": "instance-ha",
+        "barbican": "key-manager",
+        "octavia": "load-balancer",
+        "zaqar": "messaging",
+        "neutron": "network",
+        "swift": "object-store",
+        "adjutant": "registration",
+        "heat": "orchestration",
+        "placement": "placement",
+        "blazar": "reservation",
+        "manila": "shared-file-system",
+        # Add more mappings as needed; unmapped projects fall back to their own name
+    }
+
     def __init__(self, folder_path: str):
         super(OpenStackDocsMetadataProcessor, self).__init__()
         self.folder_path = Path(folder_path)
@@ -45,22 +67,41 @@ def url_function(self, path: str) -> str:
         except ValueError:
-            relative_path = path_obj.name
+            relative_path = Path(path_obj.name)  # str has no as_posix()
 
-        relative_path = relative_path.as_posix()
+        relative_path_str = relative_path.as_posix()
 
-        # Remove _docs suffix: /cinder/2025.2_docs/ → /cinder/2025.2/
-        relative_path = re.sub(r"/(\d+\.\d+)_docs/", r"/\1/", relative_path)
+        # Extract project name from path (first component); split() never
+        # returns an empty list, so no fallback is needed
+        project_name = relative_path_str.split("/", 1)[0]
+
+        # Check if this is API-Ref documentation
+        if "_api-ref/" in relative_path_str:
+            # This is API-Ref documentation - use different URL pattern
+            # Pattern: project/version_api-ref/... -> /api-ref/service/...
+
+            # Get the service name from mapping
+            service_name = self.API_REF_SERVICE_MAPPING.get(project_name, project_name)
 
-        # Remove _api-ref suffix: /cinder/2025.2_api-ref/ → /cinder/2025.2/api-ref/
-        relative_path = re.sub(r"/(\d+\.\d+)_api-ref/", r"/\1/api-ref/", relative_path)
+            # Remove project name and version_api-ref prefix
+            # Example: heat/2025.2_api-ref/v1/index.txt -> v1/index.txt
+            api_ref_pattern = re.compile(r"^[^/]+/(?:\d+\.\d+|latest)_api-ref/")
+            remaining_path = api_ref_pattern.sub("", relative_path_str)
+
+            # Replace .txt with .html
+            remaining_path = remaining_path.replace(".txt", ".html")
+            # Build API-Ref URL
+            return f"{self.base_url}/api-ref/{service_name}/{remaining_path}"
+
+        # Regular documentation keeps the project/version/... layout
+        # Remove _docs suffix: /cinder/2025.2_docs/ → /cinder/2025.2/
+        relative_path_str = re.sub(r"/(\d+\.\d+)_docs/", r"/\1/", relative_path_str)
 
         # Handle "latest" version
-        relative_path = relative_path.replace("/latest_docs/", "/latest/")
-        relative_path = relative_path.replace("/latest_api-ref/", "/latest/api-ref/")
+        relative_path_str = relative_path_str.replace("/latest_docs/", "/latest/")
 
         # Replace .txt with .html
-        relative_path = relative_path.replace(".txt", ".html")
+        relative_path_str = relative_path_str.replace(".txt", ".html")
 
-        return f"{self.base_url}/{relative_path}"
+        return f"{self.base_url}/{relative_path_str}"
 
 
 class RedHatDocsMetadataProcessor(MetadataProcessor):

@@ -205,58 +205,75 @@ deps =
     # Its regular doc build produces no usable output, but its API-Ref is needed by Neutron.
     if [ "$project" != "neutron-lib" ]; then
         tox -etext-docs
+        [ "${CLEAN_FILES}" == "venv" ] && rm -rf .tox/text-docs
     fi
-    [ "${CLEAN_FILES}" == "venv" ] && rm -rf .tox/text-docs
 
-    # Build API-Ref if enabled
-    if [ "$OS_API_DOCS" = "true" ] && [ -d "./api-ref/source" ]; then
-        if ! grep -q "text-api-ref" tox.ini; then
-            echo "$tox_text_api_ref_target" >> tox.ini
+    # Build API-Ref if enabled and the project has an api-ref or api-guide directory
+    local api_ref_failed="false"  # stays "false": a failed tox build exits below
+    if [ "$OS_API_DOCS" = "true" ]; then
+        local api_dir=""
+        if [ -d "./api-ref/source" ]; then api_dir="api-ref";
+        elif [ -d "./api-guide/source" ]; then api_dir="api-guide";
+        else
+            echo "INFO: No api-ref or api-guide directory found for $project"
         fi
 
-        local api_ref_failed="false"
-        echo "Building API-Ref documentation for $project..."
-        tox -etext-api-ref || api_ref_failed="true"
-
-        if [ "$api_ref_failed" != "true" ]; then
-            echo "Converting API-Ref HTML to plain text..."
-            rm -rf ./api-ref/build/text
-            mv ./api-ref/build/html ./api-ref/build/text
-
-            # Convert HTML to text
-            while read -r html_file; do
-                text_file="${html_file%.html}.txt"
-                [ -e "$html_file" ] && html2text "$html_file" utf8 > "$text_file"
-            done <<< "$(find ./api-ref/build/text -name "*.html")"
-
-            # Cleanup
-            find ./api-ref/build/text -type f ! -name "*.txt" -delete
-            find ./api-ref/build/text -mindepth 1 -depth -type d -empty -delete
-
-            # Remove unpublished metadata (JIRA OSPRH-19255 requirement #1)
-            find ./api-ref/build/text -name "genindex.txt" -delete
-            find ./api-ref/build/text -name "search.txt" -delete
-            find ./api-ref/build/text -path "*/_sources/*" -delete
-            find ./api-ref/build/text -type d -name "_sources" -delete
-
-            # index.txt and api_microversion_history.txt handling to prevent unreachable URLs
-            api_file_count=$(find ./api-ref/build/text -name "*.txt" \
-                ! -name "index.txt" ! -name "genindex.txt" \
-                ! -name "search.txt" ! -name "api_microversion_history.txt" \
-                -type f | wc -l)
-
-            if [ "$api_file_count" -gt 0 ]; then
-                # Has real API files - remove navigation files
-                find ./api-ref/build/text -name "index.txt" -delete
-                find ./api-ref/build/text -name "api_microversion_history.txt" -delete
-            else
-                # Only has index.txt - skip to avoid unreachable URLs
-                echo "Skipping API-Ref for $project (no content files)"
-                rm -rf ./api-ref/build/text
+        if [ -n "$api_dir" ]; then
+            echo "Building API-Ref documentation for $project using $api_dir..."
+
+            if ! grep -q "text-api-ref" tox.ini; then
+                # Retarget only the "api-ref/" paths at $api_dir so the text-api-ref env name
+                # run below is left intact (assumes paths are written "api-ref/..."; confirm)
+                local adjusted_target="${tox_text_api_ref_target//api-ref\//${api_dir}/}"
+                echo "$adjusted_target" >> tox.ini
+            fi
+
+            if ! tox -etext-api-ref; then
+                echo "ERROR: API-Ref build failed for $project"
+                exit 1
             fi
 
-            find ./api-ref/build/text -mindepth 1 -depth -type d -empty -delete 2>/dev/null || true
+            if [ "$api_ref_failed" != "true" ]; then
+                echo "Converting API-Ref HTML to plain text for $project..."
+                rm -rf "./$api_dir/build/text"
+                mkdir -p "./$api_dir/build/text"
+
+                converted_count=0
+                while IFS= read -r -d '' html_file; do
+                    rel_path="${html_file#./"$api_dir"/build/html/}"
+                    text_file="./$api_dir/build/text/${rel_path%.html}.txt"
+                    mkdir -p "$(dirname "$text_file")"
+
+                    # Convert HTML to plain text using pandoc (consistent output)
+                    pandoc -f html -t plain --wrap=preserve "$html_file" -o "$text_file" || {
+                        echo "ERROR: Failed to convert $html_file"
+                        return 1
+                    }
+
+                    converted_count=$((converted_count + 1))
+                done < <(find "./$api_dir/build/html" -name "*.html" -type f -print0)
+
+                echo "Converted $converted_count HTML files to text for $project"
+
+                # Cleanup unwanted files (logos, metadata, empty directories)
+                # File names are passed NUL-delimited so unusual names survive the pipe
+                find "./$api_dir/build/text" -type f -exec grep -l --null "logo-full.svg" {} + | xargs -0 rm -f 2>/dev/null || true
+                find "./$api_dir/build/text" \( -name "genindex.txt" -o -name "search.txt" \) -delete 2>/dev/null || true
+                rm -rf "./$api_dir/build/text/_sources" 2>/dev/null || true
+
+                # Check for content (size > 1k)
+                api_file_count=$(find "./$api_dir/build/text" -name "*.txt" -type f -size +1k 2>/dev/null | wc -l)
+
+                if [ "$api_file_count" -gt 0 ]; then
+                    echo "API-Ref: Found $api_file_count content files for $project"
+                else
+                    echo "Skipping API-Ref for $project (no content found)"
+                    rm -rf "./$api_dir/build/text"
+                fi
+                find "./$api_dir/build/text" -mindepth 1 -depth -type d -empty -delete 2>/dev/null || true
+            fi
         fi
+        [ "${CLEAN_FILES}" == "venv" ] && rm -rf .tox/text-api-ref
     fi
 
     # These projects have all their docs under "latest" instead of "2025.2"
@@ -271,12 +288,12 @@ deps =
     rm -rf "$project_output_dir"
     mkdir -p "$project_output_dir"
     # Only copy if text docs were built (skipped for neutron-lib)
-    [ -d "doc/build/text" ] && cp -r doc/build/text "$project_output_dir"/"$_output_version"_docs
+    [ -d "doc/build/text" ] && cp -r doc/build/text "${project_output_dir}/${_output_version}_docs"
 
-    # Copy API-Ref documentation only if OS_API_DOCS is enabled and build succeeded
-    if [ "$OS_API_DOCS" = "true" ] && [ -d "./api-ref/source" ] && \
-       [ "$api_ref_failed" != "true" ] && [ -d "api-ref/build/text" ]; then
-        cp -r api-ref/build/text "$project_output_dir"/"$_output_version"_api-ref
+    # Copy API-Ref documentation if it was built successfully
+    if [ "$OS_API_DOCS" = "true" ] && [ "$api_ref_failed" != "true" ] && \
+       [ -n "$api_dir" ] && [ -d "$api_dir/build/text" ]; then
+        cp -r "${api_dir}/build/text" "${project_output_dir}/${_output_version}_api-ref"
         echo "API-Ref documentation copied for $project"
     fi
 
@@ -308,7 +325,7 @@ for os_project in "${os_projects[@]}"; do
     if [ "${num_running_subproc}" -ge "${NUM_WORKERS}" ]; then
         echo "Using ${num_running_subproc}/${NUM_WORKERS} workers. Waiting ..."
         wait -n || log_and_die "Subprocess generating text documentation failed!"
-	echo "Using $(( --num_running_subproc ))/${NUM_WORKERS} workers."
+        echo "Using $(( --num_running_subproc ))/${NUM_WORKERS} workers."
     fi
 done