-
Notifications
You must be signed in to change notification settings - Fork 8
feat: improve API-ref extraction quality and coverage in the RAG #83
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,3 @@ | ||
| lightspeed-rag-content @ git+https://github.com/lightspeed-core/rag-content@main | ||
| packaging | ||
| lxml | ||
| html2text |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -205,58 +205,75 @@ deps = | |
| # Its regular doc build produces no usable output, but its API-Ref is needed by Neutron. | ||
| if [ "$project" != "neutron-lib" ]; then | ||
| tox -etext-docs | ||
| [ "${CLEAN_FILES}" == "venv" ] && rm -rf .tox/text-docs | ||
| fi | ||
| [ "${CLEAN_FILES}" == "venv" ] && rm -rf .tox/text-docs | ||
|
|
||
| # Build API-Ref if enabled | ||
| if [ "$OS_API_DOCS" = "true" ] && [ -d "./api-ref/source" ]; then | ||
| if ! grep -q "text-api-ref" tox.ini; then | ||
| echo "$tox_text_api_ref_target" >> tox.ini | ||
| # Build API-Ref if enabled and the project has an api-ref directory | ||
| local api_ref_failed="false" | ||
| if [ "$OS_API_DOCS" = "true" ]; then | ||
| local api_dir="" | ||
| if [ -d "./api-ref/source" ]; then api_dir="api-ref"; | ||
| elif [ -d "./api-guide/source" ]; then api_dir="api-guide"; | ||
omkarjoshi0304 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| else | ||
| echo "INFO: No api-ref or api-guide directory found for $project" | ||
| fi | ||
|
|
||
| local api_ref_failed="false" | ||
| echo "Building API-Ref documentation for $project..." | ||
| tox -etext-api-ref || api_ref_failed="true" | ||
|
|
||
| if [ "$api_ref_failed" != "true" ]; then | ||
| echo "Converting API-Ref HTML to plain text..." | ||
| rm -rf ./api-ref/build/text | ||
| mv ./api-ref/build/html ./api-ref/build/text | ||
|
|
||
| # Convert HTML to text | ||
| while read -r html_file; do | ||
| text_file="${html_file%.html}.txt" | ||
| [ -e "$html_file" ] && html2text "$html_file" utf8 > "$text_file" | ||
| done <<< "$(find ./api-ref/build/text -name "*.html")" | ||
|
|
||
| # Cleanup | ||
| find ./api-ref/build/text -type f ! -name "*.txt" -delete | ||
| find ./api-ref/build/text -mindepth 1 -depth -type d -empty -delete | ||
|
|
||
| # Remove unpublished metadata (JIRA OSPRH-19255 requirement #1) | ||
| find ./api-ref/build/text -name "genindex.txt" -delete | ||
| find ./api-ref/build/text -name "search.txt" -delete | ||
| find ./api-ref/build/text -path "*/_sources/*" -delete | ||
| find ./api-ref/build/text -type d -name "_sources" -delete | ||
|
|
||
| # index.txt and api_microversion_history.txt handling to prevent unreachable URLs | ||
| api_file_count=$(find ./api-ref/build/text -name "*.txt" \ | ||
| ! -name "index.txt" ! -name "genindex.txt" \ | ||
| ! -name "search.txt" ! -name "api_microversion_history.txt" \ | ||
| -type f | wc -l) | ||
|
|
||
| if [ "$api_file_count" -gt 0 ]; then | ||
| # Has real API files - remove navigation files | ||
| find ./api-ref/build/text -name "index.txt" -delete | ||
| find ./api-ref/build/text -name "api_microversion_history.txt" -delete | ||
| else | ||
| # Only has index.txt - skip to avoid unreachable URLs | ||
| echo "Skipping API-Ref for $project (no content files)" | ||
| rm -rf ./api-ref/build/text | ||
| if [ -n "$api_dir" ]; then | ||
| echo "Building API-Ref documentation for $project using $api_dir..." | ||
|
|
||
| if ! grep -q "text-api-ref" tox.ini; then | ||
| # Adjust the target if it's api-guide instead of api-ref | ||
| local adjusted_target | ||
| adjusted_target="${tox_text_api_ref_target//api-ref/$api_dir}" | ||
omkarjoshi0304 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| echo "$adjusted_target" >> tox.ini | ||
| fi | ||
|
|
||
| if ! tox -etext-api-ref; then | ||
| echo "WARNING: API-Ref build failed for $project" | ||
| exit 1 | ||
| fi | ||
|
|
||
| find ./api-ref/build/text -mindepth 1 -depth -type d -empty -delete 2>/dev/null || true | ||
| if [ "$api_ref_failed" != "true" ]; then | ||
| echo "Converting API-Ref HTML to plain text for $project..." | ||
| rm -rf "./$api_dir/build/text" | ||
| mkdir -p "./$api_dir/build/text" | ||
|
|
||
| converted_count=0 | ||
| while IFS= read -r -d '' html_file; do | ||
lpiwowar marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| rel_path="${html_file#./"$api_dir"/build/html/}" | ||
| text_file="./$api_dir/build/text/${rel_path%.html}.txt" | ||
| mkdir -p "$(dirname "$text_file")" | ||
|
|
||
| # Convert HTML to plain text using pandoc (consistent output) | ||
| pandoc -f html -t plain --wrap=preserve "$html_file" -o "$text_file" || { | ||
| echo "ERROR: Failed to convert $html_file" | ||
| return 1 | ||
| } | ||
|
|
||
| converted_count=$((converted_count + 1)) | ||
| done < <(find "./$api_dir/build/html" -name "*.html" -type f -print0) | ||
|
|
||
| echo "Converted $converted_count HTML files to text for $project" | ||
|
|
||
| # Cleanup unwanted files (logos, metadata, empty directories) | ||
| # shellcheck disable=SC2038 | ||
| find "./$api_dir/build/text" -type f -exec grep -l "logo-full.svg" {} + | xargs rm -f 2>/dev/null || true | ||
Akrog marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| find "./$api_dir/build/text" \( -name "genindex.txt" -o -name "search.txt" \) -delete 2>/dev/null || true | ||
| rm -rf "./$api_dir/build/text/_sources" 2>/dev/null || true | ||
|
|
||
| # Check for content (size > 1k) | ||
| api_file_count=$(find "./$api_dir/build/text" -name "*.txt" -type f -size +1k 2>/dev/null | wc -l) | ||
|
|
||
| if [ "$api_file_count" -gt 0 ]; then | ||
| echo "API-Ref: Found $api_file_count content files for $project" | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Wouldn't this content calculation message be wrong if we remove some files on L273?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Moved the final file count calculation to the very end of the function so the log message accurately reflects the number of files. |
||
| else | ||
| echo "Skipping API-Ref for $project (no content found)" | ||
| rm -rf "./$api_dir/build/text" | ||
| fi | ||
| find "./$api_dir/build/text" -mindepth 1 -depth -type d -empty -delete 2>/dev/null || true | ||
| fi | ||
| fi | ||
| [ "${CLEAN_FILES}" == "venv" ] && rm -rf .tox/text-api-ref | ||
| fi | ||
|
|
||
| # These projects have all their docs under "latest" instead of "2025.2" | ||
|
|
@@ -271,12 +288,12 @@ deps = | |
| rm -rf "$project_output_dir" | ||
| mkdir -p "$project_output_dir" | ||
| # Only copy if text docs were built (skipped for neutron-lib) | ||
| [ -d "doc/build/text" ] && cp -r doc/build/text "$project_output_dir"/"$_output_version"_docs | ||
| [ -d "doc/build/text" ] && cp -r doc/build/text "${project_output_dir}/${_output_version}_docs" | ||
|
|
||
| # Copy API-Ref documentation only if OS_API_DOCS is enabled and build succeeded | ||
| if [ "$OS_API_DOCS" = "true" ] && [ -d "./api-ref/source" ] && \ | ||
| [ "$api_ref_failed" != "true" ] && [ -d "api-ref/build/text" ]; then | ||
| cp -r api-ref/build/text "$project_output_dir"/"$_output_version"_api-ref | ||
| # Copy API-Ref documentation if it was built successfully | ||
| if [ "$OS_API_DOCS" = "true" ] && [ "$api_ref_failed" != "true" ] && \ | ||
| [ -n "$api_dir" ] && [ -d "$api_dir/build/text" ]; then | ||
| cp -r "${api_dir}/build/text" "${project_output_dir}/${_output_version}_api-ref" | ||
| echo "API-Ref documentation copied for $project" | ||
| fi | ||
|
|
||
|
|
@@ -308,7 +325,7 @@ for os_project in "${os_projects[@]}"; do | |
| if [ "${num_running_subproc}" -ge "${NUM_WORKERS}" ]; then | ||
| echo "Using ${num_running_subproc}/${NUM_WORKERS} workers. Waiting ..." | ||
| wait -n || log_and_die "Subprocess generating text documentation failed!" | ||
| echo "Using $(( --num_running_subproc ))/${NUM_WORKERS} workers." | ||
| echo "Using $(( --num_running_subproc ))/${NUM_WORKERS} workers." | ||
| fi | ||
| done | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.