Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Containerfile
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ COPY ./scripts ./scripts
# python-devel and pcre-devel are needed for python-openstackclient
RUN if [ "$BUILD_UPSTREAM_DOCS" = "true" ]; then \
dnf install -y graphviz python-devel pcre-devel pip && \
pip install tox html2text && \
bash -c 'curl -L https://github.com/jgm/pandoc/releases/download/3.1.11.1/pandoc-3.1.11.1-linux-amd64.tar.gz | tar -zx --strip-components=1 -C /usr/local/' && \
pip install tox && \
./scripts/get_openstack_plaintext_docs.sh; \
fi

Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
lightspeed-rag-content @ git+https://github.com/lightspeed-core/rag-content@main
packaging
lxml
html2text
59 changes: 50 additions & 9 deletions scripts/generate_embeddings_openstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,28 @@ def clean_url(unclean_url):
class OpenStackDocsMetadataProcessor(MetadataProcessor):
"""Metadata processor for OpenStack documentation."""

API_REF_SERVICE_MAPPING = {
"cinder": "block-storage",
"nova": "compute",
"trove": "database",
"designate": "dns",
"keystone": "identity",
"glance": "image",
"watcher": "resource-optimization",
"masakari": "instance-ha",
"barbican": "key-manager",
"octavia": "load-balancer",
"zaqar": "messaging",
"neutron": "network",
"swift": "object-store",
"adjutant": "registration",
"heat": "orchestration",
"placement": "placement",
"blazar": "reservation",
"manila": "shared-file-system",
# Add more mappings as needed
}

def __init__(self, folder_path: str):
super(OpenStackDocsMetadataProcessor, self).__init__()
self.folder_path = Path(folder_path)
Expand All @@ -45,22 +67,41 @@ def url_function(self, path: str) -> str:
except ValueError:
relative_path = path_obj.name

relative_path = relative_path.as_posix()
relative_path_str = relative_path.as_posix()

# Remove _docs suffix: /cinder/2025.2_docs/ → /cinder/2025.2/
relative_path = re.sub(r"/(\d+\.\d+)_docs/", r"/\1/", relative_path)
# Extract project name from path (first component)
path_parts = relative_path_str.split("/")
project_name = path_parts[0] if path_parts else ""

# Check if this is API-Ref documentation
if "_api-ref/" in relative_path_str:
# This is API-Ref documentation - use different URL pattern
# Pattern: project/version_api-ref/... -> /api-ref/service/...

# Get the service name from mapping
service_name = self.API_REF_SERVICE_MAPPING.get(project_name, project_name)

# Remove _api-ref suffix: /cinder/2025.2_api-ref/ → /cinder/2025.2/api-ref/
relative_path = re.sub(r"/(\d+\.\d+)_api-ref/", r"/\1/api-ref/", relative_path)
# Remove project name and version_api-ref prefix
# Example: heat/2025.2_api-ref/v1/index.txt -> v1/index.txt
api_ref_pattern = re.compile(r"^[^/]+/(?:\d+\.\d+|latest)_api-ref/")
remaining_path = api_ref_pattern.sub("", relative_path_str)

# Replace .txt with .html
remaining_path = remaining_path.replace(".txt", ".html")
# Build API-Ref URL
return f"{self.base_url}/api-ref/{service_name}/{remaining_path}"

# Regular documentation - existing logic
# Remove _docs suffix: /cinder/2025.2_docs/ → /cinder/2025.2/
relative_path_str = re.sub(r"/(\d+\.\d+)_docs/", r"/\1/", relative_path_str)

# Handle "latest" version
relative_path = relative_path.replace("/latest_docs/", "/latest/")
relative_path = relative_path.replace("/latest_api-ref/", "/latest/api-ref/")
relative_path_str = relative_path_str.replace("/latest_docs/", "/latest/")

# Replace .txt with .html
relative_path = relative_path.replace(".txt", ".html")
relative_path_str = relative_path_str.replace(".txt", ".html")

return f"{self.base_url}/{relative_path}"
return f"{self.base_url}/{relative_path_str}"


class RedHatDocsMetadataProcessor(MetadataProcessor):
Expand Down
119 changes: 68 additions & 51 deletions scripts/get_openstack_plaintext_docs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -205,58 +205,75 @@ deps =
# Its regular doc build produces no usable output, but its API-Ref is needed by Neutron.
if [ "$project" != "neutron-lib" ]; then
tox -etext-docs
[ "${CLEAN_FILES}" == "venv" ] && rm -rf .tox/text-docs
fi
[ "${CLEAN_FILES}" == "venv" ] && rm -rf .tox/text-docs

# Build API-Ref if enabled
if [ "$OS_API_DOCS" = "true" ] && [ -d "./api-ref/source" ]; then
if ! grep -q "text-api-ref" tox.ini; then
echo "$tox_text_api_ref_target" >> tox.ini
# Build API-Ref if enabled and the project has an api-ref directory
local api_ref_failed="false"
if [ "$OS_API_DOCS" = "true" ]; then
local api_dir=""
if [ -d "./api-ref/source" ]; then api_dir="api-ref";
elif [ -d "./api-guide/source" ]; then api_dir="api-guide";
else
echo "INFO: No api-ref or api-guide directory found for $project"
fi

local api_ref_failed="false"
echo "Building API-Ref documentation for $project..."
tox -etext-api-ref || api_ref_failed="true"

if [ "$api_ref_failed" != "true" ]; then
echo "Converting API-Ref HTML to plain text..."
rm -rf ./api-ref/build/text
mv ./api-ref/build/html ./api-ref/build/text

# Convert HTML to text
while read -r html_file; do
text_file="${html_file%.html}.txt"
[ -e "$html_file" ] && html2text "$html_file" utf8 > "$text_file"
done <<< "$(find ./api-ref/build/text -name "*.html")"

# Cleanup
find ./api-ref/build/text -type f ! -name "*.txt" -delete
find ./api-ref/build/text -mindepth 1 -depth -type d -empty -delete

# Remove unpublished metadata (JIRA OSPRH-19255 requirement #1)
find ./api-ref/build/text -name "genindex.txt" -delete
find ./api-ref/build/text -name "search.txt" -delete
find ./api-ref/build/text -path "*/_sources/*" -delete
find ./api-ref/build/text -type d -name "_sources" -delete

# index.txt and api_microversion_history.txt handling to prevent unreachable URLs
api_file_count=$(find ./api-ref/build/text -name "*.txt" \
! -name "index.txt" ! -name "genindex.txt" \
! -name "search.txt" ! -name "api_microversion_history.txt" \
-type f | wc -l)

if [ "$api_file_count" -gt 0 ]; then
# Has real API files - remove navigation files
find ./api-ref/build/text -name "index.txt" -delete
find ./api-ref/build/text -name "api_microversion_history.txt" -delete
else
# Only has index.txt - skip to avoid unreachable URLs
echo "Skipping API-Ref for $project (no content files)"
rm -rf ./api-ref/build/text
if [ -n "$api_dir" ]; then
echo "Building API-Ref documentation for $project using $api_dir..."

if ! grep -q "text-api-ref" tox.ini; then
# Adjust the target if it's api-guide instead of api-ref
local adjusted_target
adjusted_target="${tox_text_api_ref_target//api-ref/$api_dir}"
echo "$adjusted_target" >> tox.ini
fi

if ! tox -etext-api-ref; then
echo "WARNING: API-Ref build failed for $project"
exit 1
fi

find ./api-ref/build/text -mindepth 1 -depth -type d -empty -delete 2>/dev/null || true
if [ "$api_ref_failed" != "true" ]; then
echo "Converting API-Ref HTML to plain text for $project..."
rm -rf "./$api_dir/build/text"
mkdir -p "./$api_dir/build/text"

converted_count=0
while IFS= read -r -d '' html_file; do
rel_path="${html_file#./"$api_dir"/build/html/}"
text_file="./$api_dir/build/text/${rel_path%.html}.txt"
mkdir -p "$(dirname "$text_file")"

# Convert HTML to plain text using pandoc (consistent output)
pandoc -f html -t plain --wrap=preserve "$html_file" -o "$text_file" || {
echo "ERROR: Failed to convert $html_file"
return 1
}

converted_count=$((converted_count + 1))
done < <(find "./$api_dir/build/html" -name "*.html" -type f -print0)

echo "Converted $converted_count HTML files to text for $project"

# Cleanup unwanted files (logos, metadata, empty directories)
# shellcheck disable=SC2038
find "./$api_dir/build/text" -type f -exec grep -l "logo-full.svg" {} + | xargs rm -f 2>/dev/null || true
find "./$api_dir/build/text" \( -name "genindex.txt" -o -name "search.txt" \) -delete 2>/dev/null || true
rm -rf "./$api_dir/build/text/_sources" 2>/dev/null || true

# Check for content (size > 1k)
api_file_count=$(find "./$api_dir/build/text" -name "*.txt" -type f -size +1k 2>/dev/null | wc -l)

if [ "$api_file_count" -gt 0 ]; then
echo "API-Ref: Found $api_file_count content files for $project"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't this content calculation message be wrong if we remove some files on L273?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved the final file count calculation to the very end of the function so the log message accurately reflects the number of files.

else
echo "Skipping API-Ref for $project (no content found)"
rm -rf "./$api_dir/build/text"
fi
find "./$api_dir/build/text" -mindepth 1 -depth -type d -empty -delete 2>/dev/null || true
fi
fi
[ "${CLEAN_FILES}" == "venv" ] && rm -rf .tox/text-api-ref
fi

# These projects have all their docs under "latest" instead of "2025.2"
Expand All @@ -271,12 +288,12 @@ deps =
rm -rf "$project_output_dir"
mkdir -p "$project_output_dir"
# Only copy if text docs were built (skipped for neutron-lib)
[ -d "doc/build/text" ] && cp -r doc/build/text "$project_output_dir"/"$_output_version"_docs
[ -d "doc/build/text" ] && cp -r doc/build/text "${project_output_dir}/${_output_version}_docs"

# Copy API-Ref documentation only if OS_API_DOCS is enabled and build succeeded
if [ "$OS_API_DOCS" = "true" ] && [ -d "./api-ref/source" ] && \
[ "$api_ref_failed" != "true" ] && [ -d "api-ref/build/text" ]; then
cp -r api-ref/build/text "$project_output_dir"/"$_output_version"_api-ref
# Copy API-Ref documentation if it was built successfully
if [ "$OS_API_DOCS" = "true" ] && [ "$api_ref_failed" != "true" ] && \
[ -n "$api_dir" ] && [ -d "$api_dir/build/text" ]; then
cp -r "${api_dir}/build/text" "${project_output_dir}/${_output_version}_api-ref"
echo "API-Ref documentation copied for $project"
fi

Expand Down Expand Up @@ -308,7 +325,7 @@ for os_project in "${os_projects[@]}"; do
if [ "${num_running_subproc}" -ge "${NUM_WORKERS}" ]; then
echo "Using ${num_running_subproc}/${NUM_WORKERS} workers. Waiting ..."
wait -n || log_and_die "Subprocess generating text documentation failed!"
echo "Using $(( --num_running_subproc ))/${NUM_WORKERS} workers."
echo "Using $(( --num_running_subproc ))/${NUM_WORKERS} workers."
fi
done

Expand Down