diff --git a/Containerfile b/Containerfile
index 4ed6b36..dbb2b5c 100644
--- a/Containerfile
+++ b/Containerfile
@@ -23,7 +23,8 @@
 COPY ./scripts ./scripts
 # python-devel and pcre-devel are needed for python-openstackclient
 RUN if [ "$BUILD_UPSTREAM_DOCS" = "true" ]; then \
         dnf install -y graphviz python-devel pcre-devel pip && \
-        pip install tox html2text && \
+        bash -c 'curl -L https://github.com/jgm/pandoc/releases/download/3.1.11.1/pandoc-3.1.11.1-linux-amd64.tar.gz | tar -zx --strip-components=1 -C /usr/local/' && \
+        pip install tox && \
         ./scripts/get_openstack_plaintext_docs.sh; \
     fi
diff --git a/requirements.txt b/requirements.txt
index 996335f..867454d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
 lightspeed-rag-content @ git+https://github.com/lightspeed-core/rag-content@main
 packaging
 lxml
-html2text
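For reference, a minimal sanity check (not part of the patch) that the pinned pandoc tarball above actually puts the binary on PATH inside the image; the version string is an assumption read off the download URL:

import shutil
import subprocess

# The release tarball unpacks to pandoc-3.1.11.1/{bin,share}; with
# --strip-components=1 -C /usr/local/ the binary lands in /usr/local/bin.
pandoc = shutil.which("pandoc")
assert pandoc is not None, "pandoc not found on PATH"

# First line of `pandoc --version` is expected to read "pandoc 3.1.11.1".
result = subprocess.run([pandoc, "--version"], capture_output=True, text=True, check=True)
print(result.stdout.splitlines()[0])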
diff --git a/scripts/generate_embeddings_openstack.py b/scripts/generate_embeddings_openstack.py
index 0563943..6e22d8e 100644
--- a/scripts/generate_embeddings_openstack.py
+++ b/scripts/generate_embeddings_openstack.py
@@ -32,6 +32,28 @@ def clean_url(unclean_url):
 class OpenStackDocsMetadataProcessor(MetadataProcessor):
     """Metadata processor for OpenStack documentation."""
 
+    API_REF_SERVICE_MAPPING = {
+        "cinder": "block-storage",
+        "nova": "compute",
+        "trove": "database",
+        "designate": "dns",
+        "keystone": "identity",
+        "glance": "image",
+        "watcher": "resource-optimization",
+        "masakari": "instance-ha",
+        "barbican": "key-manager",
+        "octavia": "load-balancer",
+        "zaqar": "messaging",
+        "neutron": "network",
+        "swift": "object-store",
+        "adjutant": "registration",
+        "heat": "orchestration",
+        "placement": "placement",
+        "blazar": "reservation",
+        "manila": "shared-file-system",
+        # Add more mappings as needed
+    }
+
     def __init__(self, folder_path: str):
         super(OpenStackDocsMetadataProcessor, self).__init__()
         self.folder_path = Path(folder_path)
@@ -45,22 +67,41 @@ def url_function(self, path: str) -> str:
         except ValueError:
             relative_path = path_obj.name
 
-        relative_path = relative_path.as_posix()
+        relative_path_str = relative_path.as_posix()
 
-        # Remove _docs suffix: /cinder/2025.2_docs/ → /cinder/2025.2/
-        relative_path = re.sub(r"/(\d+\.\d+)_docs/", r"/\1/", relative_path)
+        # Extract project name from path (first component)
+        path_parts = relative_path_str.split("/")
+        project_name = path_parts[0] if path_parts else ""
+
+        # Check if this is API-Ref documentation
+        if "_api-ref/" in relative_path_str:
+            # This is API-Ref documentation - use different URL pattern
+            # Pattern: project/version_api-ref/... -> /api-ref/service/...
+
+            # Get the service name from mapping
+            service_name = self.API_REF_SERVICE_MAPPING.get(project_name, project_name)
 
-        # Remove _api-ref suffix: /cinder/2025.2_api-ref/ → /cinder/2025.2/api-ref/
-        relative_path = re.sub(r"/(\d+\.\d+)_api-ref/", r"/\1/api-ref/", relative_path)
+            # Remove project name and version_api-ref prefix
+            # Example: heat/2025.2_api-ref/v1/index.txt -> v1/index.txt
+            api_ref_pattern = re.compile(r"^[^/]+/(?:\d+\.\d+|latest)_api-ref/")
+            remaining_path = api_ref_pattern.sub("", relative_path_str)
+
+            # Replace .txt with .html
+            remaining_path = remaining_path.replace(".txt", ".html")
+            # Build API-Ref URL
+            return f"{self.base_url}/api-ref/{service_name}/{remaining_path}"
+
+        # Regular documentation - existing logic
+        # Remove _docs suffix: /cinder/2025.2_docs/ → /cinder/2025.2/
+        relative_path_str = re.sub(r"/(\d+\.\d+)_docs/", r"/\1/", relative_path_str)
 
         # Handle "latest" version
-        relative_path = relative_path.replace("/latest_docs/", "/latest/")
-        relative_path = relative_path.replace("/latest_api-ref/", "/latest/api-ref/")
+        relative_path_str = relative_path_str.replace("/latest_docs/", "/latest/")
 
         # Replace .txt with .html
-        relative_path = relative_path.replace(".txt", ".html")
+        relative_path_str = relative_path_str.replace(".txt", ".html")
 
-        return f"{self.base_url}/{relative_path}"
+        return f"{self.base_url}/{relative_path_str}"
 
 
 class RedHatDocsMetadataProcessor(MetadataProcessor):
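To illustrate the URL rewriting that url_function() now performs, here is a standalone sketch; the docs.openstack.org base URL, the trimmed-down service mapping, and the example paths are assumptions used only for illustration:

import re

BASE_URL = "https://docs.openstack.org"  # assumed base URL
SERVICE_MAPPING = {"heat": "orchestration", "cinder": "block-storage"}

def to_url(relative_path_str: str) -> str:
    project_name = relative_path_str.split("/")[0]
    if "_api-ref/" in relative_path_str:
        # API-Ref docs are re-rooted under /api-ref/<service>/ and lose the version.
        service = SERVICE_MAPPING.get(project_name, project_name)
        remaining = re.sub(r"^[^/]+/(?:\d+\.\d+|latest)_api-ref/", "", relative_path_str)
        return f"{BASE_URL}/api-ref/{service}/{remaining.replace('.txt', '.html')}"
    # Regular docs keep the project/version prefix.
    relative_path_str = re.sub(r"/(\d+\.\d+)_docs/", r"/\1/", relative_path_str)
    relative_path_str = relative_path_str.replace("/latest_docs/", "/latest/")
    return f"{BASE_URL}/{relative_path_str.replace('.txt', '.html')}"

print(to_url("heat/2025.2_api-ref/v1/index.txt"))
# -> https://docs.openstack.org/api-ref/orchestration/v1/index.html
print(to_url("cinder/2025.2_docs/admin/blockstorage.txt"))
# -> https://docs.openstack.org/cinder/2025.2/admin/blockstorage.html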
-name "*.txt" -delete - find ./api-ref/build/text -mindepth 1 -depth -type d -empty -delete - - # Remove unpublished metadata (JIRA OSPRH-19255 requirement #1) - find ./api-ref/build/text -name "genindex.txt" -delete - find ./api-ref/build/text -name "search.txt" -delete - find ./api-ref/build/text -path "*/_sources/*" -delete - find ./api-ref/build/text -type d -name "_sources" -delete - - # index.txt and api_microversion_history.txt handling to prevent unreachable URLs - api_file_count=$(find ./api-ref/build/text -name "*.txt" \ - ! -name "index.txt" ! -name "genindex.txt" \ - ! -name "search.txt" ! -name "api_microversion_history.txt" \ - -type f | wc -l) - - if [ "$api_file_count" -gt 0 ]; then - # Has real API files - remove navigation files - find ./api-ref/build/text -name "index.txt" -delete - find ./api-ref/build/text -name "api_microversion_history.txt" -delete - else - # Only has index.txt - skip to avoid unreachable URLs - echo "Skipping API-Ref for $project (no content files)" - rm -rf ./api-ref/build/text + if [ -n "$api_dir" ]; then + echo "Building API-Ref documentation for $project using $api_dir..." + + if ! grep -q "text-api-ref" tox.ini; then + # Adjust the target if it's api-guide instead of api-ref + local adjusted_target + adjusted_target="${tox_text_api_ref_target//api-ref/$api_dir}" + echo "$adjusted_target" >> tox.ini + fi + + if ! tox -etext-api-ref; then + echo "WARNING: API-Ref build failed for $project" + exit 1 fi - find ./api-ref/build/text -mindepth 1 -depth -type d -empty -delete 2>/dev/null || true + if [ "$api_ref_failed" != "true" ]; then + echo "Converting API-Ref HTML to plain text for $project..." + rm -rf "./$api_dir/build/text" + mkdir -p "./$api_dir/build/text" + + converted_count=0 + while IFS= read -r -d '' html_file; do + rel_path="${html_file#./"$api_dir"/build/html/}" + text_file="./$api_dir/build/text/${rel_path%.html}.txt" + mkdir -p "$(dirname "$text_file")" + + # Convert HTML to plain text using pandoc (consistent output) + pandoc -f html -t plain --wrap=preserve "$html_file" -o "$text_file" || { + echo "ERROR: Failed to convert $html_file" + return 1 + } + + converted_count=$((converted_count + 1)) + done < <(find "./$api_dir/build/html" -name "*.html" -type f -print0) + + echo "Converted $converted_count HTML files to text for $project" + + # Cleanup unwanted files (logos, metadata, empty directories) + # shellcheck disable=SC2038 + find "./$api_dir/build/text" -type f -exec grep -l "logo-full.svg" {} + | xargs rm -f 2>/dev/null || true + find "./$api_dir/build/text" \( -name "genindex.txt" -o -name "search.txt" \) -delete 2>/dev/null || true + rm -rf "./$api_dir/build/text/_sources" 2>/dev/null || true + + # Check for content (size > 1k) + api_file_count=$(find "./$api_dir/build/text" -name "*.txt" -type f -size +1k 2>/dev/null | wc -l) + + if [ "$api_file_count" -gt 0 ]; then + echo "API-Ref: Found $api_file_count content files for $project" + else + echo "Skipping API-Ref for $project (no content found)" + rm -rf "./$api_dir/build/text" + fi + find "./$api_dir/build/text" -mindepth 1 -depth -type d -empty -delete 2>/dev/null || true + fi fi + [ "${CLEAN_FILES}" == "venv" ] && rm -rf .tox/text-api-ref fi # These projects have all their docs under "latest" instead of "2025.2" @@ -271,12 +288,12 @@ deps = rm -rf "$project_output_dir" mkdir -p "$project_output_dir" # Only copy if text docs were built (skipped for neutron-lib) - [ -d "doc/build/text" ] && cp -r doc/build/text "$project_output_dir"/"$_output_version"_docs + [ -d 
"doc/build/text" ] && cp -r doc/build/text "${project_output_dir}/${_output_version}_docs" - # Copy API-Ref documentation only if OS_API_DOCS is enabled and build succeeded - if [ "$OS_API_DOCS" = "true" ] && [ -d "./api-ref/source" ] && \ - [ "$api_ref_failed" != "true" ] && [ -d "api-ref/build/text" ]; then - cp -r api-ref/build/text "$project_output_dir"/"$_output_version"_api-ref + # Copy API-Ref documentation if it was built successfully + if [ "$OS_API_DOCS" = "true" ] && [ "$api_ref_failed" != "true" ] && \ + [ -n "$api_dir" ] && [ -d "$api_dir/build/text" ]; then + cp -r "${api_dir}/build/text" "${project_output_dir}/${_output_version}_api-ref" echo "API-Ref documentation copied for $project" fi @@ -308,7 +325,7 @@ for os_project in "${os_projects[@]}"; do if [ "${num_running_subproc}" -ge "${NUM_WORKERS}" ]; then echo "Using ${num_running_subproc}/${NUM_WORKERS} workers. Waiting ..." wait -n || log_and_die "Subprocess generating text documentation failed!" - echo "Using $(( --num_running_subproc ))/${NUM_WORKERS} workers." + echo "Using $(( --num_running_subproc ))/${NUM_WORKERS} workers." fi done