From 0ed34048d4b698fb80b63e42d808bee868ae5003 Mon Sep 17 00:00:00 2001 From: Kinga Kazala <252134343+kinga-altF4@users.noreply.github.com> Date: Sun, 18 Jan 2026 16:19:07 +0100 Subject: [PATCH 1/7] GetUnsuccesfulCrawledUrls --- .../Search/GetUnsuccesfulCrawledUrls.cs | 180 ++++++++++++++++++ 1 file changed, 180 insertions(+) create mode 100644 src/Commands/Search/GetUnsuccesfulCrawledUrls.cs diff --git a/src/Commands/Search/GetUnsuccesfulCrawledUrls.cs b/src/Commands/Search/GetUnsuccesfulCrawledUrls.cs new file mode 100644 index 000000000..86b227089 --- /dev/null +++ b/src/Commands/Search/GetUnsuccesfulCrawledUrls.cs @@ -0,0 +1,180 @@ +ο»Ώusing System; +using System.Collections.Generic; +using System.Linq; +using System.Management.Automation; +using Microsoft.SharePoint.Client; +using Microsoft.SharePoint.Client.Search.Administration; +using PnP.PowerShell.Commands.Attributes; + +namespace PnP.PowerShell.Commands.Search +{ + + [Cmdlet(VerbsCommon.Get, "PnPGetUnsuccesfulCrawledUrls")] + [ApiNotAvailableUnderApplicationPermissions] + public class GetUnsuccesfulCrawledUrls : PnPWebCmdlet + { + [Parameter(Mandatory = false)] + public string Filter; + + [Parameter(Mandatory = false)] + //DateOnly used + public DateTime StartDate = DateTime.MinValue; + + [Parameter(Mandatory = false)] + //DateOnly used + public DateTime EndDate = DateTime.UtcNow.AddDays(1); + + [Parameter(Mandatory = false)] + public SwitchParameter RawFormat; + + [Parameter(Mandatory = false)] + public SwitchParameter IncreaseRequestTimeout; + + + private const int MaxRows = 100000; + + protected override void ExecuteCmdlet() + { + try + { + if(IncreaseRequestTimeout) + { + string timeoutValue = Environment.GetEnvironmentVariable("SharePointPnPHttpTimeout"); + if (string.IsNullOrEmpty(timeoutValue)) + { + LogWarning("The timeout may be only increased if the SharePointPnPHttpTimeout environment variable is set to 180000 or -1."); + LogWarning("Use $env:SharePointPnPHttpTimeout = -1 command and then, 
establish new connection with Connect-PnPOnline."); + return; + } + else + { + //Max 3 minutes, because Default CSOM timeout is 180,000 ms + ClientContext.RequestTimeout=3*60*1000; + } + } + var crawlLog = new DocumentCrawlLog(ClientContext, ClientContext.Site); + ClientContext.Load(crawlLog); + + + string postFilter = string.Empty; + if (string.IsNullOrWhiteSpace(Filter)) + { + Filter = $"https://{GetHostName()}.sharepoint.{PnP.Framework.AuthenticationManager.GetSharePointDomainSuffix(Connection.AzureEnvironment)}"; + } + + var logEntries = crawlLog.GetUnsuccesfulCrawledUrls(Filter, StartDate, EndDate); + ClientContext.ExecuteQueryRetry(); + + if (RawFormat) + { + var entries = new List(); + foreach (var dictionary in logEntries.Value.Rows) + { + string url = System.Net.WebUtility.UrlDecode(dictionary["FullUrl"].ToString()); + if (string.IsNullOrWhiteSpace(postFilter) || url.Contains(postFilter)) + { + entries.Add(ConvertToPSObject(dictionary)); + } + } + } + else + { + var entries = new List(logEntries.Value.Rows.Count); + foreach (var dictionary in logEntries.Value.Rows) + { + var entry = MapCrawlLogEntry(dictionary); + if (string.IsNullOrWhiteSpace(postFilter) || entry.Url.Contains(postFilter)) + { + entries.Add(entry); + } + } + + WriteObject(entries.OrderByDescending(i => i.CrawlTime).ToList(), true); + } + } + catch (Exception e) + { + if(e.Message=="The operation has timed out." ) + { + + LogError($"Error: {e.Message}. Default CSOM timeout is 180,000 ms (β‰ˆ3 minutes). If you are querying large crawl logs or broad ranges, the server may take longer than that. "); + + } + else + { + LogError($"Error: {e.Message}. 
Make sure you are granted access to the crawl log via the SharePoint search admin center at https://-admin.sharepoint.com/_layouts/15/searchadmin/crawllogreadpermission.aspx"); + } + } + } + +#region Helper functions + + private string GetHostName() + { + return new Uri(ClientContext.Url).Host.Replace("-admin", "").Replace("-public", "").Replace("-my", "").Replace($".sharepoint.{PnP.Framework.AuthenticationManager.GetSharePointDomainSuffix(Connection.AzureEnvironment)}", ""); + } + + private int GetContentSourceIdForSites(DocumentCrawlLog crawlLog) + { + var hostName = GetHostName(); + var spContent = crawlLog.GetCrawledUrls(false, 10, $"https://{hostName}.sharepoint.{PnP.Framework.AuthenticationManager.GetSharePointDomainSuffix(Connection.AzureEnvironment)}/sites", true, -1, (int)LogLevel.All, -1, DateTime.Now.AddDays(-100), DateTime.Now.AddDays(1)); + ClientContext.ExecuteQueryRetry(); + if (spContent.Value.Rows.Count > 0) return (int)spContent.Value.Rows.First()["ContentSourceID"]; + return -1; + } + + private int GetContentSourceIdForUserProfiles(DocumentCrawlLog crawlLog) + { + var hostName = GetHostName(); + var peopleContent = crawlLog.GetCrawledUrls(false, 100, $"sps3s://{hostName}-my.sharepoint.{PnP.Framework.AuthenticationManager.GetSharePointDomainSuffix(Connection.AzureEnvironment)}", true, -1, (int)LogLevel.All, -1, DateTime.Now.AddDays(-100), DateTime.Now.AddDays(1)); + ClientContext.ExecuteQueryRetry(); + if (peopleContent.Value.Rows.Count > 0) return (int)peopleContent.Value.Rows.First()["ContentSourceID"]; + return -1; + } + + private static CrawlEntry MapCrawlLogEntry(Dictionary dictionary) + { + var entry = new CrawlEntry + { + ItemId = (int)dictionary["DocID"], + ContentSourceId = -1, + Url = dictionary["FullUrl"].ToString(), + CrawlTime = (DateTime)dictionary["TimeStamp"], + LastTouchedTime= (DateTime)dictionary["LastTouchedTime"], + DatabaseName= (string)dictionary["DatabaseName"] + }; + long.TryParse(dictionary["LastRepositoryModifiedTime"] + 
"", out long ticks); + if (ticks != 0) + { + var itemDate = DateTime.FromFileTimeUtc(ticks); + entry.ItemTime = itemDate; + } + entry.LogLevel = + (LogLevel)Enum.Parse(typeof(LogLevel), dictionary["ErrorLevel"].ToString()); + + + entry.Status = dictionary["StatusMessage"] + ""; + entry.Status += dictionary["ErrorDesc"] + ""; + var errorCode = int.Parse(dictionary["ErrorCode"]+""); + if (!string.IsNullOrWhiteSpace(entry.Status) || errorCode != 0) + { + entry.LogLevel = LogLevel.Warning; + } + return entry; + } + + private object ConvertToPSObject(IDictionary r) + { + PSObject res = new PSObject(); + if (r != null) + { + foreach (var kvp in r) + { + res.Properties.Add(new PSNoteProperty(kvp.Key, kvp.Value)); + } + } + return res; + } +#endregion + } +} From 1a3012267789f7eecaddfa15a27a4e11c31e999a Mon Sep 17 00:00:00 2001 From: Kinga Kazala <252134343+kinga-altF4@users.noreply.github.com> Date: Sun, 18 Jan 2026 17:05:07 +0100 Subject: [PATCH 2/7] Add PnPGetUnsuccesfulCrawledUrls command --- .../Search/GetUnsuccesfulCrawledUrls.cs | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/src/Commands/Search/GetUnsuccesfulCrawledUrls.cs b/src/Commands/Search/GetUnsuccesfulCrawledUrls.cs index 86b227089..e3dc3373d 100644 --- a/src/Commands/Search/GetUnsuccesfulCrawledUrls.cs +++ b/src/Commands/Search/GetUnsuccesfulCrawledUrls.cs @@ -8,6 +8,17 @@ namespace PnP.PowerShell.Commands.Search { + public class UnsuccesfullCrawlEntry + { + public string Url { get; set; } + public DateTime CrawlTime { get; set; } + public DateTime ItemTime { get; set; } + public string Status { get; set; } + public int ErrorCode { get; set; } + public int ItemId { get; set; } + public DateTime LastTouchedTime { get; set; } + public string DatabaseName { get; set; } + } [Cmdlet(VerbsCommon.Get, "PnPGetUnsuccesfulCrawledUrls")] [ApiNotAvailableUnderApplicationPermissions] @@ -79,7 +90,7 @@ protected override void ExecuteCmdlet() } else { - var entries = new 
List(logEntries.Value.Rows.Count); + var entries = new List(logEntries.Value.Rows.Count); foreach (var dictionary in logEntries.Value.Rows) { var entry = MapCrawlLogEntry(dictionary); @@ -132,34 +143,26 @@ private int GetContentSourceIdForUserProfiles(DocumentCrawlLog crawlLog) return -1; } - private static CrawlEntry MapCrawlLogEntry(Dictionary dictionary) + private static UnsuccesfullCrawlEntry MapCrawlLogEntry(Dictionary dictionary) { - var entry = new CrawlEntry + var entry = new UnsuccesfullCrawlEntry { ItemId = (int)dictionary["DocID"], - ContentSourceId = -1, Url = dictionary["FullUrl"].ToString(), CrawlTime = (DateTime)dictionary["TimeStamp"], LastTouchedTime= (DateTime)dictionary["LastTouchedTime"], DatabaseName= (string)dictionary["DatabaseName"] }; - long.TryParse(dictionary["LastRepositoryModifiedTime"] + "", out long ticks); + var time=dictionary["SPItemModifiedTime"]+"" ?? dictionary["LastModifiedTime"]+"" ??""; + long.TryParse(time, out long ticks); if (ticks != 0) { var itemDate = DateTime.FromFileTimeUtc(ticks); entry.ItemTime = itemDate; } - entry.LogLevel = - (LogLevel)Enum.Parse(typeof(LogLevel), dictionary["ErrorLevel"].ToString()); - - entry.Status = dictionary["StatusMessage"] + ""; - entry.Status += dictionary["ErrorDesc"] + ""; - var errorCode = int.Parse(dictionary["ErrorCode"]+""); - if (!string.IsNullOrWhiteSpace(entry.Status) || errorCode != 0) - { - entry.LogLevel = LogLevel.Warning; - } + entry.Status = (dictionary["ErrorDesc"]??"").ToString(); + entry.ErrorCode = int.Parse(dictionary["ErrorCode"]+""); return entry; } From c9c73fd92007fde520b798e3869541b25c32c668 Mon Sep 17 00:00:00 2001 From: Kinga Kazala <252134343+kinga-altF4@users.noreply.github.com> Date: Sun, 18 Jan 2026 17:40:07 +0100 Subject: [PATCH 3/7] Documentation page for Get-PnPGetUnsuccesfulCrawledUrls --- .../Get-PnPGetUnsuccesfulCrawledUrls.md | 173 ++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 
documentation/Get-PnPGetUnsuccesfulCrawledUrls.md diff --git a/documentation/Get-PnPGetUnsuccesfulCrawledUrls.md b/documentation/Get-PnPGetUnsuccesfulCrawledUrls.md new file mode 100644 index 000000000..0e14325f8 --- /dev/null +++ b/documentation/Get-PnPGetUnsuccesfulCrawledUrls.md @@ -0,0 +1,173 @@ +--- +Module Name: PnP.PowerShell +schema: 2.0.0 +applicable: SharePoint Online +online version: https://pnp.github.io/powershell/cmdlets/Get-PnPFooter.html +external help file: PnP.PowerShell.dll-Help.xml +title: Get-PnPFooter +--- + +# Get-PnPGetUnsuccesfulCrawledUrls + + +## SYNOPSIS + +Retrieve a list of URLs that failed to be indexed during a search crawl, which is useful for diagnosing search issues. +> Make sure you are granted access to the crawl log via the SharePoint search admin center at https://-admin.sharepoint.com/_layouts/15/searchadmin/crawllogreadpermission.aspx in order to run this cmdlet. + +## SYNTAX + +```powershell +Get-PnPGetUnsuccesfulCrawledUrls [-Filter ] [-StartDate ] [-EndDate ] [-RawFormat] + [-IncreaseRequestTimeout] + [-Connection ] +``` + +## DESCRIPTION + +Enables retrieval of items that failed to be indexed during a search crawl. This is particularly useful when processing large lists or libraries and encountering request timeouts. By focusing exclusively on errors, you can reliably identify issues without additional effort to narrow the query scope. + +> This command relies on `DocumentCrawlLog.GetUnsuccesfulCrawledUrls` undocumented method. + +### EXAMPLE 1 +```powershell +Get-PnPGetUnsuccesfulCrawledUrls +``` + +Returns all (?) crawl log errors for site content. During tests, more than 3000 items were returned. + +### EXAMPLE 2 +```powershell +Get-PnPGetUnsuccesfulCrawledUrls -Filter "https://contoso-my.sharepoint.com/sites/Intranet" +``` +Returns all (?) crawl log errors for the specified site. + +### EXAMPLE 3 +```powershell +Get-PnPGetUnsuccesfulCrawledUrls -StartDate (Get-Date).AddDays(-10) +``` + +Returns all (?) 
crawl log errors, starting from 10 days ago. + +> Based on the author's test results and Copilot's input πŸ˜‰, the `DocumentCrawlLog` methods don't respect the time component in `StartDate` and `EndDate`. They only use the date portion for filtering. Internally, the crawl log is grouped by crawl day, so any hour/minute you provide is ignored. The CSOM API (GetCrawledUrls) accepts DateTime values, but the backend partitions data by date, not timestamp. + +### EXAMPLE 4 +```powershell +$ClientID= "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" +$env:SharePointPnPHttpTimeout = -1 #πŸ‘ˆ + +Connect-PnPOnline -Url https://-admin.sharepoint.com/ -Interactive -ClientId $ClientID -ErrorAction Stop # πŸ‘ˆ + +Get-PnPGetUnsuccesfulCrawledUrls -Filter "https://contoso-my.sharepoint.com/sites/Intranet" +``` + +Increases the Request Timeout allowing the call to last up to 3 minutes. The `ClientRuntimeContext` enforces a three-minute limit; when increasing the timeout to its maximum of three minutes, this threshold may still be exceeded. + + +## PARAMETERS + +### -Connection +Optional connection to be used by the cmdlet. Retrieve the value for this parameter by either specifying -ReturnConnection on Connect-PnPOnline or by executing Get-PnPConnection. + +```yaml +Type: PnPConnection +Parameter Sets: (All) + +Required: False +Position: Named +Default value: None +Accept pipeline input: False +Accept wildcard characters: False +``` + + +### -EndDate +End date to stop getting entries from. Default to current time. + +```yaml +Type: DateTime +Parameter Sets: (All) + +Required: False +Position: Named +Default value: None +Accept pipeline input: False +Accept wildcard characters: False +``` + +### -Filter +Filter to limit what is being returned. Has to be a URL prefix for SharePoint content. Wildcard characters are not supported. 
+ +```yaml +Type: String +Parameter Sets: (All) + +Required: False +Position: Named +Default value: None +Accept pipeline input: False +Accept wildcard characters: False +``` + + +### -RawFormat +Show raw crawl log data + +```yaml +Type: SwitchParameter +Parameter Sets: (All) + +Required: False +Position: Named +Default value: None +Accept pipeline input: False +Accept wildcard characters: False +``` + + +### -StartDate +Start date to start getting entries from. Defaults to start of time. + +```yaml +Type: DateTime +Parameter Sets: (All) + +Required: False +Position: Named +Default value: None +Accept pipeline input: False +Accept wildcard characters: False +``` + +### -IncreaseRequestTimeout + +```yaml +Type: Switch +Parameter Sets: (All) + +Required: False +Position: Named +Default value: None +Accept pipeline input: False +Accept wildcard characters: False +``` + +Increases timeout to maximum 3 minutes. +The `ClientRuntimeContext` enforces a three-minute limit; when increasing the timeout to its maximum of three minutes, this threshold may still be exceeded. + +> Note: Before running Get-PnPUnsuccessfulCrawledUrls with -IncreaseRequestTimeout, you must set $env:SharePointPnPHttpTimeout = -1 to remove the default HttpClient timeout. Then establish a new PnP connection because the environment variable is only applied when the session initializes. 
+ +```powershell +$ClientID= "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" +$env:SharePointPnPHttpTimeout = -1 + +Connect-PnPOnline -Url https://-admin.sharepoint.com/ -Interactive -ClientId $ClientID -ErrorAction Stop + +$scope="https://contoso-my.sharepoint.com/sites/Intranet" +Get-PnPGetUnsuccesfulCrawledUrls -Filter $scope +``` + +## RELATED LINKS + +[Microsoft 365 Patterns and Practices](https://aka.ms/m365pnp) + From cfa95748557fecd7c9ccc85335c954dc3374fb63 Mon Sep 17 00:00:00 2001 From: Kinga Kazala <252134343+kinga-altF4@users.noreply.github.com> Date: Sat, 24 Jan 2026 16:39:07 +0100 Subject: [PATCH 4/7] readme --- .../Get-PnPGetUnsuccesfulCrawledUrls.md | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/documentation/Get-PnPGetUnsuccesfulCrawledUrls.md b/documentation/Get-PnPGetUnsuccesfulCrawledUrls.md index 0e14325f8..4cd2e8f26 100644 --- a/documentation/Get-PnPGetUnsuccesfulCrawledUrls.md +++ b/documentation/Get-PnPGetUnsuccesfulCrawledUrls.md @@ -27,29 +27,28 @@ Get-PnPGetUnsuccesfulCrawledUrls [-Filter ] [-StartDate ] [-E Enables retrieval of items that failed to be indexed during a search crawl. This is particularly useful when processing large lists or libraries and encountering request timeouts. By focusing exclusively on errors, you can reliably identify issues without additional effort to narrow the query scope. -> This command relies on `DocumentCrawlLog.GetUnsuccesfulCrawledUrls` undocumented method. - ### EXAMPLE 1 ```powershell Get-PnPGetUnsuccesfulCrawledUrls ``` -Returns all (?) crawl log errors for site content. During tests, more than 3000 items were returned. +Returns all crawler log warnings and errors for site content. The amount of returned entries is limited by the request timeout. ### EXAMPLE 2 ```powershell Get-PnPGetUnsuccesfulCrawledUrls -Filter "https://contoso-my.sharepoint.com/sites/Intranet" ``` -Returns all (?) crawl log errors for the specified site. 
+Returns crawl log warnings and errors for the specified site. The amount of returned entries is limited by the request timeout. ### EXAMPLE 3 ```powershell Get-PnPGetUnsuccesfulCrawledUrls -StartDate (Get-Date).AddDays(-10) ``` -Returns all (?) crawl log errors, starting from 10 days ago. +Returns crawler log warnings and errors, starting from 10 days ago. -> Based on the author's test results and Copilot's input πŸ˜‰, the `DocumentCrawlLog` methods don't respect the time component in `StartDate` and `EndDate`. They only use the date portion for filtering. Internally, the crawl log is grouped by crawl day, so any hour/minute you provide is ignored. The CSOM API (GetCrawledUrls) accepts DateTime values, but the backend partitions data by date, not timestamp. +> Based on the author's test results and Copilot's input πŸ˜‰, the `DocumentCrawlLog` methods don't respect the __time__ component in `StartDate` and `EndDate`. They only __use the date__ portion for filtering. Internally, the crawl log is grouped by crawl day, so any hour/minute you provide is ignored. +The CSOM API (GetCrawledUrls) accepts DateTime values, but the backend partitions data by date, not timestamp. ### EXAMPLE 4 ```powershell @@ -58,10 +57,10 @@ $env:SharePointPnPHttpTimeout = -1 #πŸ‘ˆ Connect-PnPOnline -Url https://-admin.sharepoint.com/ -Interactive -ClientId $ClientID -ErrorAction Stop # πŸ‘ˆ -Get-PnPGetUnsuccesfulCrawledUrls -Filter "https://contoso-my.sharepoint.com/sites/Intranet" +Get-PnPGetUnsuccesfulCrawledUrls -Filter "https://contoso-my.sharepoint.com/sites/Intranet" -IncreaseRequestTimeout ``` -Increases the Request Timeout allowing the call to last up to 3 minutes. The `ClientRuntimeContext` enforces a three-minute limit; when increasing the timeout to its maximum of three minutes, this threshold may still be exceeded. +Increases the request timeout allowing the call to last up to 3 minutes. 
The `ClientRuntimeContext` enforces a three-minute limit; when increasing the timeout to its maximum of three minutes, this threshold may still be exceeded. ## PARAMETERS @@ -82,7 +81,7 @@ Accept wildcard characters: False ### -EndDate -End date to stop getting entries from. Default to current time. +End date to stop getting entries from. Defaults to current time. ```yaml Type: DateTime @@ -109,12 +108,10 @@ Accept pipeline input: False Accept wildcard characters: False ``` - -### -RawFormat -Show raw crawl log data +### -IncreaseRequestTimeout ```yaml -Type: SwitchParameter +Type: Switch Parameter Sets: (All) Required: False @@ -124,12 +121,11 @@ Accept pipeline input: False Accept wildcard characters: False ``` - -### -StartDate -Start date to start getting entries from. Defaults to start of time. +### -RawFormat +Show raw crawl log data ```yaml -Type: DateTime +Type: SwitchParameter Parameter Sets: (All) Required: False @@ -139,10 +135,12 @@ Accept pipeline input: False Accept wildcard characters: False ``` -### -IncreaseRequestTimeout + +### -StartDate +Start date to start getting entries from. Defaults to start of time. ```yaml -Type: Switch +Type: DateTime Parameter Sets: (All) Required: False @@ -152,6 +150,8 @@ Accept pipeline input: False Accept wildcard characters: False ``` + + Increases timeout to maximum 3 minutes. The `ClientRuntimeContext` enforces a three-minute limit; when increasing the timeout to its maximum of three minutes, this threshold may still be exceeded. 
From 4ecbfbbe58a99471707263814280ecba339485ea Mon Sep 17 00:00:00 2001 From: Kinga Kazala <252134343+kinga-altF4@users.noreply.github.com> Date: Sat, 24 Jan 2026 16:50:08 +0100 Subject: [PATCH 5/7] correct command name --- .../Get-PnPGetUnsuccesfulCrawledUrls.md | 173 ------------------ .../Search/GetUnsuccesfulCrawledUrls.cs | 2 +- 2 files changed, 1 insertion(+), 174 deletions(-) delete mode 100644 documentation/Get-PnPGetUnsuccesfulCrawledUrls.md diff --git a/documentation/Get-PnPGetUnsuccesfulCrawledUrls.md b/documentation/Get-PnPGetUnsuccesfulCrawledUrls.md deleted file mode 100644 index 4cd2e8f26..000000000 --- a/documentation/Get-PnPGetUnsuccesfulCrawledUrls.md +++ /dev/null @@ -1,173 +0,0 @@ ---- -Module Name: PnP.PowerShell -schema: 2.0.0 -applicable: SharePoint Online -online version: https://pnp.github.io/powershell/cmdlets/Get-PnPFooter.html -external help file: PnP.PowerShell.dll-Help.xml -title: Get-PnPFooter ---- - -# Get-PnPGetUnsuccesfulCrawledUrls - - -## SYNOPSIS - -Retrieve a list of URLs that failed to be indexed during a search crawl, which is useful for diagnosing search issues. -> Make sure you are granted access to the crawl log via the SharePoint search admin center at https://-admin.sharepoint.com/_layouts/15/searchadmin/crawllogreadpermission.aspx in order to run this cmdlet. - -## SYNTAX - -```powershell -Get-PnPGetUnsuccesfulCrawledUrls [-Filter ] [-StartDate ] [-EndDate ] [-RawFormat] - [-IncreaseRequestTimeout] - [-Connection ] -``` - -## DESCRIPTION - -Enables retrieval of items that failed to be indexed during a search crawl. This is particularly useful when processing large lists or libraries and encountering request timeouts. By focusing exclusively on errors, you can reliably identify issues without additional effort to narrow the query scope. - -### EXAMPLE 1 -```powershell -Get-PnPGetUnsuccesfulCrawledUrls -``` - -Returns all crawler log warnings and errors for site content. 
The amount of returned entries is limited by the request timeout. - -### EXAMPLE 2 -```powershell -Get-PnPGetUnsuccesfulCrawledUrls -Filter "https://contoso-my.sharepoint.com/sites/Intranet" -``` -Returns crawl log warnings and errors for the specified site. The amount of returned entries is limited by the request timeout. - -### EXAMPLE 3 -```powershell -Get-PnPGetUnsuccesfulCrawledUrls -StartDate (Get-Date).AddDays(-10) -``` - -Returns crawler log warnings and errors, starting from 10 days ago. - -> Based on the author's test results and Copilot's input πŸ˜‰, the `DocumentCrawlLog` methods don't respect the __time__ component in `StartDate` and `EndDate`. They only __use the date__ portion for filtering. Internally, the crawl log is grouped by crawl day, so any hour/minute you provide is ignored. -The CSOM API (GetCrawledUrls) accepts DateTime values, but the backend partitions data by date, not timestamp. - -### EXAMPLE 4 -```powershell -$ClientID= "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -$env:SharePointPnPHttpTimeout = -1 #πŸ‘ˆ - -Connect-PnPOnline -Url https://-admin.sharepoint.com/ -Interactive -ClientId $ClientID -ErrorAction Stop # πŸ‘ˆ - -Get-PnPGetUnsuccesfulCrawledUrls -Filter "https://contoso-my.sharepoint.com/sites/Intranet" -IncreaseRequestTimeout -``` - -Increases the request timeout allowing the call to last up to 3 minutes. The `ClientRuntimeContext` enforces a three-minute limit; when increasing the timeout to its maximum of three minutes, this threshold may still be exceeded. - - -## PARAMETERS - -### -Connection -Optional connection to be used by the cmdlet. Retrieve the value for this parameter by either specifying -ReturnConnection on Connect-PnPOnline or by executing Get-PnPConnection. - -```yaml -Type: PnPConnection -Parameter Sets: (All) - -Required: False -Position: Named -Default value: None -Accept pipeline input: False -Accept wildcard characters: False -``` - - -### -EndDate -End date to stop getting entries from. 
Defaults to current time. - -```yaml -Type: DateTime -Parameter Sets: (All) - -Required: False -Position: Named -Default value: None -Accept pipeline input: False -Accept wildcard characters: False -``` - -### -Filter -Filter to limit what is being returned. Has to be a URL prefix for SharePoint content. Wildcard characters are not supported. - -```yaml -Type: String -Parameter Sets: (All) - -Required: False -Position: Named -Default value: None -Accept pipeline input: False -Accept wildcard characters: False -``` - -### -IncreaseRequestTimeout - -```yaml -Type: Switch -Parameter Sets: (All) - -Required: False -Position: Named -Default value: None -Accept pipeline input: False -Accept wildcard characters: False -``` - -### -RawFormat -Show raw crawl log data - -```yaml -Type: SwitchParameter -Parameter Sets: (All) - -Required: False -Position: Named -Default value: None -Accept pipeline input: False -Accept wildcard characters: False -``` - - -### -StartDate -Start date to start getting entries from. Defaults to start of time. - -```yaml -Type: DateTime -Parameter Sets: (All) - -Required: False -Position: Named -Default value: None -Accept pipeline input: False -Accept wildcard characters: False -``` - - - -Increases timeout to maximum 3 minutes. -The `ClientRuntimeContext` enforces a three-minute limit; when increasing the timeout to its maximum of three minutes, this threshold may still be exceeded. - -> Note: Before running Get-PnPUnsuccessfulCrawledUrls with -IncreaseRequestTimeout, you must set $env:SharePointPnPHttpTimeout = -1 to remove the default HttpClient timeout. Then establish a new PnP connection because the environment variable is only applied when the session initializes. 
- 
-```powershell
-$ClientID= "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
-$env:SharePointPnPHttpTimeout = -1
-
-Connect-PnPOnline -Url https://-admin.sharepoint.com/ -Interactive -ClientId $ClientID -ErrorAction Stop
-
-$scope="https://contoso-my.sharepoint.com/sites/Intranet"
-Get-PnPGetUnsuccesfulCrawledUrls -Filter $scope
-```
-
-## RELATED LINKS
-
-[Microsoft 365 Patterns and Practices](https://aka.ms/m365pnp)
-
diff --git a/src/Commands/Search/GetUnsuccesfulCrawledUrls.cs b/src/Commands/Search/GetUnsuccesfulCrawledUrls.cs
index e3dc3373d..53336a3f7 100644
--- a/src/Commands/Search/GetUnsuccesfulCrawledUrls.cs
+++ b/src/Commands/Search/GetUnsuccesfulCrawledUrls.cs
@@ -20,7 +20,7 @@ public class UnsuccesfullCrawlEntry
         public string DatabaseName { get; set; }
     }
 
-    [Cmdlet(VerbsCommon.Get, "PnPGetUnsuccesfulCrawledUrls")]
+    [Cmdlet(VerbsCommon.Get, "PnPUnsuccesfulCrawledUrls")]
     [ApiNotAvailableUnderApplicationPermissions]
     public class GetUnsuccesfulCrawledUrls : PnPWebCmdlet
     {

From b092d1f9447428772f852c5ca67d8ffe74c5c38d Mon Sep 17 00:00:00 2001
From: Kinga Kazala <252134343+kinga-altF4@users.noreply.github.com>
Date: Sat, 24 Jan 2026 16:50:34 +0100
Subject: [PATCH 6/7] readme

---
 .../Get-PnPUnsuccesfulCrawledUrls.md | 173 ++++++++++++++++++
 1 file changed, 173 insertions(+)
 create mode 100644 documentation/Get-PnPUnsuccesfulCrawledUrls.md

diff --git a/documentation/Get-PnPUnsuccesfulCrawledUrls.md b/documentation/Get-PnPUnsuccesfulCrawledUrls.md
new file mode 100644
index 000000000..26cb0afc5
--- /dev/null
+++ b/documentation/Get-PnPUnsuccesfulCrawledUrls.md
@@ -0,0 +1,173 @@
+---
+Module Name: PnP.PowerShell
+schema: 2.0.0
+applicable: SharePoint Online
+online version: https://pnp.github.io/powershell/cmdlets/Get-PnPUnsuccesfulCrawledUrls.html
+external help file: PnP.PowerShell.dll-Help.xml
+title: Get-PnPUnsuccesfulCrawledUrls
+---
+
+# Get-PnPUnsuccesfulCrawledUrls
+
+
+## SYNOPSIS
+
+Retrieve a list of URLs that failed to be indexed during a search crawl, which is useful for diagnosing search 
issues. +> Make sure you are granted access to the crawl log via the SharePoint search admin center at https://-admin.sharepoint.com/_layouts/15/searchadmin/crawllogreadpermission.aspx in order to run this cmdlet. + +## SYNTAX + +```powershell +Get-PnPUnsuccesfulCrawledUrls [-Filter ] [-StartDate ] [-EndDate ] [-RawFormat] + [-IncreaseRequestTimeout] + [-Connection ] +``` + +## DESCRIPTION + +Enables retrieval of items that failed to be indexed during a search crawl. This is particularly useful when processing large lists or libraries and encountering request timeouts. By focusing exclusively on errors, you can reliably identify issues without additional effort to narrow the query scope. + +### EXAMPLE 1 +```powershell +Get-PnPUnsuccesfulCrawledUrls +``` + +Returns all crawler log warnings and errors for site content. The amount of returned entries is limited by the request timeout. + +### EXAMPLE 2 +```powershell +Get-PnPUnsuccesfulCrawledUrls -Filter "https://contoso-my.sharepoint.com/sites/Intranet" +``` +Returns crawl log warnings and errors for the specified site. The amount of returned entries is limited by the request timeout. + +### EXAMPLE 3 +```powershell +Get-PnPUnsuccesfulCrawledUrls -StartDate (Get-Date).AddDays(-10) +``` + +Returns crawler log warnings and errors, starting from 10 days ago. + +> Based on the author's test results and Copilot's input πŸ˜‰, the `DocumentCrawlLog` methods don't respect the __time__ component in `StartDate` and `EndDate`. They only __use the date__ portion for filtering. Internally, the crawl log is grouped by crawl day, so any hour/minute you provide is ignored. +The CSOM API (GetCrawledUrls) accepts DateTime values, but the backend partitions data by date, not timestamp. 
+ +### EXAMPLE 4 +```powershell +$ClientID= "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" +$env:SharePointPnPHttpTimeout = -1 #πŸ‘ˆ + +Connect-PnPOnline -Url https://-admin.sharepoint.com/ -Interactive -ClientId $ClientID -ErrorAction Stop # πŸ‘ˆ + +Get-PnPUnsuccesfulCrawledUrls -Filter "https://contoso-my.sharepoint.com/sites/Intranet" -IncreaseRequestTimeout +``` + +Increases the request timeout allowing the call to last up to 3 minutes. The `ClientRuntimeContext` enforces a three-minute limit; when increasing the timeout to its maximum of three minutes, this threshold may still be exceeded. + + +## PARAMETERS + +### -Connection +Optional connection to be used by the cmdlet. Retrieve the value for this parameter by either specifying -ReturnConnection on Connect-PnPOnline or by executing Get-PnPConnection. + +```yaml +Type: PnPConnection +Parameter Sets: (All) + +Required: False +Position: Named +Default value: None +Accept pipeline input: False +Accept wildcard characters: False +``` + + +### -EndDate +End date to stop getting entries from. Defaults to current time. + +```yaml +Type: DateTime +Parameter Sets: (All) + +Required: False +Position: Named +Default value: None +Accept pipeline input: False +Accept wildcard characters: False +``` + +### -Filter +Filter to limit what is being returned. Has to be a URL prefix for SharePoint content. Wildcard characters are not supported. 
+ +```yaml +Type: String +Parameter Sets: (All) + +Required: False +Position: Named +Default value: None +Accept pipeline input: False +Accept wildcard characters: False +``` + +### -IncreaseRequestTimeout + +```yaml +Type: Switch +Parameter Sets: (All) + +Required: False +Position: Named +Default value: None +Accept pipeline input: False +Accept wildcard characters: False +``` + +### -RawFormat +Show raw crawl log data + +```yaml +Type: SwitchParameter +Parameter Sets: (All) + +Required: False +Position: Named +Default value: None +Accept pipeline input: False +Accept wildcard characters: False +``` + + +### -StartDate +Start date to start getting entries from. Defaults to start of time. + +```yaml +Type: DateTime +Parameter Sets: (All) + +Required: False +Position: Named +Default value: None +Accept pipeline input: False +Accept wildcard characters: False +``` + + + +Increases timeout to maximum 3 minutes. +The `ClientRuntimeContext` enforces a three-minute limit; when increasing the timeout to its maximum of three minutes, this threshold may still be exceeded. + +> Note: Before running Get-PnPUnsuccessfulCrawledUrls with -IncreaseRequestTimeout, you must set $env:SharePointPnPHttpTimeout = -1 to remove the default HttpClient timeout. Then establish a new PnP connection because the environment variable is only applied when the session initializes. 
+ +```powershell +$ClientID= "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" +$env:SharePointPnPHttpTimeout = -1 + +Connect-PnPOnline -Url https://-admin.sharepoint.com/ -Interactive -ClientId $ClientID -ErrorAction Stop + +$scope="https://contoso-my.sharepoint.com/sites/Intranet" +Get-PnPUnsuccesfulCrawledUrls -Filter $scope +``` + +## RELATED LINKS + +[Microsoft 365 Patterns and Practices](https://aka.ms/m365pnp) + From ba4308c476b46e4fac11bb144d64a701678c2167 Mon Sep 17 00:00:00 2001 From: Kinga Kazala <252134343+kinga-altF4@users.noreply.github.com> Date: Sat, 24 Jan 2026 17:51:57 +0100 Subject: [PATCH 7/7] tests --- .../Search/GetUnsuccesfulCrawledUrlsTests.cs | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 src/Tests/Search/GetUnsuccesfulCrawledUrlsTests.cs diff --git a/src/Tests/Search/GetUnsuccesfulCrawledUrlsTests.cs b/src/Tests/Search/GetUnsuccesfulCrawledUrlsTests.cs new file mode 100644 index 000000000..40835ae88 --- /dev/null +++ b/src/Tests/Search/GetUnsuccesfulCrawledUrlsTests.cs @@ -0,0 +1,89 @@ +using System; +using Microsoft.VisualStudio.TestTools.UnitTesting; +using System.Management.Automation.Runspaces; + +namespace PnP.PowerShell.Tests.Search +{ + [TestClass] + public class GetUnsuccesfulCrawledUrlsTests + { + #region Test Setup/CleanUp + [ClassInitialize] + public static void Initialize(TestContext testContext) + { + // This runs on class level once before all tests run + //using (var ctx = TestCommon.CreateClientContext()) + //{ + //} + } + + [ClassCleanup] + public static void Cleanup(TestContext testContext) + { + // This runs on class level once + //using (var ctx = TestCommon.CreateClientContext()) + //{ + //} + } + + [TestInitialize] + public void Initialize() + { + using (var scope = new PSTestScope()) + { + // Example + // scope.ExecuteCommand("cmdlet", new CommandParameter("param1", prop)); + } + } + + [TestCleanup] + public void Cleanup() + { + using (var scope = new PSTestScope()) + { + try + { + // Do Test Setup 
- Note, this runs PER test
+                }
+                catch (Exception)
+                {
+                    // Describe Exception
+                }
+            }
+        }
+        #endregion
+
+        #region Scaffolded Cmdlet Tests
+        //TODO: This is a scaffold of the cmdlet - complete the unit test
+        //[TestMethod]
+        public void GetPnPUnsuccesfulCrawledUrlsTest()
+        {
+            using (var scope = new PSTestScope(true))
+            {
+                // Complete writing cmd parameters
+
+                // From Cmdlet Help: Filter to limit what is being returned. Has to be a URL prefix for SharePoint content, and part of a user principal name for user profiles. Wildcard characters are not supported.
+                var filter = "";
+                // From Cmdlet Help: Start date to start getting entries from. Defaults to start of time.
+                var startDate = "";
+                // From Cmdlet Help: End date to stop getting entries from. Defaults to current time.
+                var endDate = "";
+                // From Cmdlet Help: Show raw crawl log data
+                var rawFormat = "";
+                // From Cmdlet Help: Increases the request timeout for this command to accommodate large result sets
+                var increaseRequestTimeout = "";
+
+                var results = scope.ExecuteCommand("Get-PnPUnsuccesfulCrawledUrls",
+                    new CommandParameter("Filter", filter),
+                    new CommandParameter("StartDate", startDate),
+                    new CommandParameter("EndDate", endDate),
+                    new CommandParameter("RawFormat", rawFormat),
+                    new CommandParameter("IncreaseRequestTimeout", increaseRequestTimeout));
+
+                Assert.IsNotNull(results);
+            }
+        }
+        #endregion
+    }
+}
+ 
\ No newline at end of file