diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index de65925..89f6c20 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -614,7 +614,7 @@ If you need to change the website URL: #### Step 1: Update Critical Configuration - [ ] Update `_config.yml` → `url:` field -- [ ] Update `robots.txt` → `Sitemap:` line +- [ ] Verify `robots.txt` → `Sitemap:` line (generated from `{{ site.url }}{{ site.baseurl }}`) - [ ] Update or remove `CNAME` file if using custom domain #### Step 2: Test Locally diff --git a/robots.txt b/robots.txt index ee3c1e8..a59c141 100644 --- a/robots.txt +++ b/robots.txt @@ -13,18 +13,39 @@ permalink: /robots.txt # - Malicious crawlers may ignore this file # - For GitHub Pages, this provides basic protection -# Allow major search engines with rate limiting +# Allow major search engines. +# Note: Googlebot ignores Crawl-delay directives, so we omit it to avoid Search Console warnings. User-agent: Googlebot -Crawl-delay: 10 -Allow: / +Disallow: /images/ +Disallow: /assets/ +Disallow: /_site/ +Disallow: /bin/ +Disallow: /CNAME +Disallow: /README.md +Disallow: /DEVELOPMENT.md +Disallow: /.htaccess User-agent: Bingbot Crawl-delay: 10 -Allow: / +Disallow: /images/ +Disallow: /assets/ +Disallow: /_site/ +Disallow: /bin/ +Disallow: /CNAME +Disallow: /README.md +Disallow: /DEVELOPMENT.md +Disallow: /.htaccess User-agent: Slurp Crawl-delay: 10 -Allow: / +Disallow: /images/ +Disallow: /assets/ +Disallow: /_site/ +Disallow: /bin/ +Disallow: /CNAME +Disallow: /README.md +Disallow: /DEVELOPMENT.md +Disallow: /.htaccess # Block aggressive/problematic crawlers User-agent: MJ12bot @@ -64,18 +85,14 @@ Crawl-delay: 10 Disallow: /images/ Disallow: /assets/ Disallow: /_site/ +Disallow: /bin/ +Disallow: /CNAME +Disallow: /README.md +Disallow: /DEVELOPMENT.md +Disallow: /.htaccess -# Allow access to main pages -Allow: /$ -Allow: /allnews -Allow: /allnews.html -Allow: /team -Allow: /publications -Allow: /contact -Allow: /funding -Allow: /gallery -Allow: /openings -Allow: /sitemap.xml +# Allow access to main pages (everything else is allowed by default) +Allow: / # Sitemap location (helps good crawlers index efficiently) Sitemap: {{ site.url }}{{ site.baseurl }}/sitemap.xml diff --git a/sitemap.xml b/sitemap.xml index 84a0624..830fd66 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -4,25 +4,17 @@ permalink: /sitemap.xml --- - {% for page in site.pages %} - {% if page.url == nil %} - {% continue %} - {% endif %} - - {% if page.exclude_from_sitemap == true %} - {% continue %} - {% endif %} - - {% if page.url == "/404.html" or page.url == "/sitemap.xml" or page.url == "/robots.txt" %} - {% continue %} - {% endif %} - - {% if page.url contains ".css" or page.url contains ".js" or page.url contains ".xml" or page.url contains ".txt" %} - {% continue %} - {% endif %} + {% assign pages_list = site.pages | where_exp: "p", "p.url != nil" %} + {% for page in pages_list %} + {% if page.exclude_from_sitemap == true %}{% continue %}{% endif %} + {% if page.url == "/404.html" or page.url == "/sitemap.xml" or page.url == "/robots.txt" %}{% continue %}{% endif %} + {% if page.url contains ".css" or page.url contains ".js" or page.url contains ".xml" or page.url contains ".txt" %}{% continue %}{% endif %} {{ site.url }}{{ site.baseurl }}{{ page.url | replace: "index.html", "" }} + {% if page.last_modified_at %} + {{ page.last_modified_at | date_to_xmlschema }} + {% endif %} {% endfor %}