99import pandas as pd
1010import re
1111from collections .abc import Callable , Iterable
12- from datetime import date , datetime , timezone
12+ from datetime import date , datetime , time , timezone
1313from google .cloud import storage
1414from schemas .aem import SURVEY_METADATA , IngestConfig
15+ from services .aem_parsers .common import (
16+ TEMPORAL_DATETIME_COLUMNS ,
17+ TEMPORAL_TIME_COLUMNS ,
18+ )
1519from services .util import transform_srid
1620from shapely .geometry import box , mapping
17- from urllib .parse import quote_plus , urlencode
21+ from urllib .parse import quote , quote_plus , urlencode
1822
1923logger = logging .getLogger (__name__ )
2024
@@ -42,8 +46,75 @@ def _stac_datetime_or_none(value) -> str | None:
4246 return f"{ text_value } Z"
4347
4448
def _combine_date_and_time(date_value, time_value):
    """Merge a calendar date and a time-of-day into a single datetime.

    Args:
        date_value: A ``date``, ``datetime``, ``pd.Timestamp``, or any value
            ``pd.to_datetime`` can parse. ``None``/NaN/NaT means "no date".
        time_value: A ``time``, ``datetime``, ``pd.Timestamp``, or any value
            ``pd.to_datetime`` can parse. ``None``/NaN/NaT means "no time".

    Returns:
        ``None`` when the date is missing. The date value itself (a
        ``pd.Timestamp`` is first converted to a ``datetime``) when the time
        is missing or unparseable, when the date is unparseable, or when the
        date is a ``datetime`` that already carries a non-midnight time.
        Otherwise a ``datetime`` built from the date part and the time part.
    """
    if date_value is None or pd.isna(date_value):
        return None
    if time_value is None or pd.isna(time_value):
        return date_value

    # Work with plain stdlib objects from here on.
    if isinstance(date_value, pd.Timestamp):
        date_value = date_value.to_pydatetime()
    if isinstance(time_value, pd.Timestamp):
        time_value = time_value.to_pydatetime()

    # Timezone attached to the date, if any. ``.date()`` discards it, so it
    # is remembered here and re-applied below when the time has no zone.
    date_tz = None
    if isinstance(date_value, datetime):
        # ``datetime`` must be tested before ``date`` (it is a subclass).
        if date_value.time() != datetime.min.time():
            # The date already has a real time-of-day; trust it as-is.
            return date_value
        date_part = date_value.date()
        date_tz = date_value.tzinfo
    elif isinstance(date_value, date):
        date_part = date_value
    else:
        parsed_date = pd.to_datetime(date_value, errors="coerce")
        if pd.isna(parsed_date):
            return date_value
        date_part = parsed_date.date()
        date_tz = parsed_date.tzinfo

    if isinstance(time_value, datetime):
        time_part = time_value.timetz()
    elif isinstance(time_value, time):
        time_part = time_value
    else:
        parsed_time = pd.to_datetime(time_value, errors="coerce")
        if pd.isna(parsed_time):
            return date_value
        time_part = parsed_time.timetz()

    # A zone on the time wins; otherwise keep the date's zone so an aware
    # midnight date does not silently turn into a naive datetime.
    if time_part.tzinfo is None and date_tz is not None:
        time_part = time_part.replace(tzinfo=date_tz)

    return datetime.combine(date_part, time_part)
83+
84+
def _stac_datetimes_from_frame(df: pd.DataFrame) -> list[str]:
    """Collect the distinct, sorted datetime strings found in *df*.

    Sources are tried in order and the first one that yields any value wins:

    1. Each column named in ``TEMPORAL_DATETIME_COLUMNS``.
    2. ``date_acquired`` combined row-by-row with each column named in
       ``TEMPORAL_TIME_COLUMNS`` (via ``_combine_date_and_time``).
    3. ``date_acquired`` on its own.

    Every raw value is passed through ``_stac_datetime_or_none``; values it
    maps to ``None`` are dropped.

    Args:
        df: Frame to inspect. Missing columns are simply skipped.

    Returns:
        Sorted list of unique datetime strings; empty when nothing usable
        is present.
    """

    def _unique_sorted(raw_values) -> list[str]:
        # Format, drop unusable entries, de-duplicate, then sort.
        formatted = (_stac_datetime_or_none(value) for value in raw_values)
        return sorted({value for value in formatted if value is not None})

    for col in TEMPORAL_DATETIME_COLUMNS:
        if col not in df.columns:
            continue
        cleaned = _unique_sorted(df[col])
        if cleaned:
            return cleaned

    if "date_acquired" not in df.columns:
        return []

    for time_col in TEMPORAL_TIME_COLUMNS:
        if time_col not in df.columns:
            continue
        # A column with no usable time adds nothing: combining would just
        # echo the bare dates and stop the search before a later, populated
        # time column gets a chance.
        if not df[time_col].notna().any():
            continue
        cleaned = _unique_sorted(
            _combine_date_and_time(date_value, time_value)
            for date_value, time_value in zip(
                df["date_acquired"], df[time_col], strict=False
            )
        )
        if cleaned:
            return cleaned

    return _unique_sorted(df["date_acquired"])
112+
113+
def _gcs_href(bucket: str, path: str) -> str:
    """Return the ``storage.googleapis.com`` HTTPS URL for a bucket object.

    Args:
        bucket: Bucket name, inserted into the URL as-is.
        path: Object path inside the bucket. Leading slashes are removed and
            the rest is percent-encoded, keeping ``/`` as the separator.

    Returns:
        ``https://storage.googleapis.com/<bucket>/<encoded path>``.
    """
    object_path = quote(path.lstrip("/"), safe="/")
    return f"https://storage.googleapis.com/{bucket}/{object_path}"
47118
48119
49120def _get_env_or_none (name : str ) -> str | None :
@@ -56,12 +127,30 @@ def _build_geoserver_endpoint(
56127 default_path : str ,
57128 override_env_name : str ,
58129) -> str :
def _join_url_path(base_url: str, path: str) -> str:
    """Append *path* to *base_url* without repeating shared path segments.

    If the trailing path segments of *base_url* equal the leading segments
    of *path*, the shared part is emitted only once, e.g.
    ``https://host/geoserver`` + ``/geoserver/wms`` gives
    ``https://host/geoserver/wms``.

    Only the path component of *base_url* takes part in the comparison, so
    a host that happens to be named like the first path segment
    (``http://geoserver`` + ``/geoserver/wms``) is never mistaken for an
    overlap.

    Args:
        base_url: Base URL; trailing slashes are ignored.
        path: Path to append; leading slashes are ignored.

    Returns:
        The joined URL.
    """
    # Function-scope import: the module does not import urlsplit itself.
    from urllib.parse import urlsplit

    normalized_base = base_url.rstrip("/")
    normalized_path = f"/{path.lstrip('/')}"
    path_parts = normalized_path.lstrip("/").split("/")

    # Segments of the base URL's path only (scheme and host excluded).
    base_path = urlsplit(normalized_base).path.strip("/")
    base_parts = base_path.split("/") if base_path else []

    # Prefer the longest overlap between the base's tail and the path's head.
    for size in range(min(len(base_parts), len(path_parts)), 0, -1):
        if base_parts[-size:] == path_parts[:size]:
            suffix = "/".join(path_parts[size:])
            return f"{normalized_base}/{suffix}" if suffix else normalized_base
    return f"{normalized_base}{normalized_path}"
147+
59148 override = _get_env_or_none (override_env_name )
60149 if override is None :
61- return f" { public_url . rstrip ( '/' ) } { default_path } "
150+ return _join_url_path ( public_url , default_path )
62151 if override .startswith ("http://" ) or override .startswith ("https://" ):
63152 return override
64- return f" { public_url . rstrip ( '/' ) } / { override . lstrip ( '/' ) } "
153+ return _join_url_path ( public_url , override )
65154
66155
67156def _geoserver_layer_name (collection_id : str , workspace : str ) -> str :
@@ -158,17 +247,7 @@ def _stac_temporal_extent(
158247 df : pd .DataFrame ,
159248 config : IngestConfig ,
160249) -> tuple [str , str ]:
161- temporal_values = (
162- sorted (
163- {
164- _stac_datetime_or_none (value )
165- for value in df .get ("date_acquired" , pd .Series ())
166- }
167- )
168- if "date_acquired" in df .columns
169- else []
170- )
171- temporal_values = [value for value in temporal_values if value is not None ]
250+ temporal_values = _stac_datetimes_from_frame (df )
172251 if not temporal_values :
173252 fallback = _fallback_stac_datetime (config )
174253 return fallback , fallback
@@ -316,11 +395,7 @@ def build_stac_items(
316395 )
317396 geometry = mapping (geometry_point )
318397 bbox = [round (float (value ), 6 ) for value in geometry_point .bounds ]
319- datetimes = [
320- _stac_datetime_or_none (value )
321- for value in group .get ("date_acquired" , pd .Series ())
322- ]
323- datetimes = sorted ({value for value in datetimes if value is not None })
398+ datetimes = _stac_datetimes_from_frame (group )
324399 if datetimes :
325400 datetime_value = datetimes [0 ] if len (datetimes ) == 1 else None
326401 start_datetime = datetimes [0 ]
0 commit comments