2929from dstack ._internal .core .models .runs import JobSpec , Run , RunSpec , ServiceModelSpec , ServiceSpec
3030from dstack ._internal .server import settings
3131from dstack ._internal .server .models import GatewayModel , JobModel , ProjectModel , RunModel
32+ from dstack ._internal .server .services import events
3233from dstack ._internal .server .services .gateways import (
3334 get_gateway_configuration ,
3435 get_or_add_gateway_connection ,
@@ -114,7 +115,7 @@ async def _register_service_in_gateway(
114115 domain = service_spec .get_domain ()
115116 assert domain is not None
116117
117- conn = await get_or_add_gateway_connection (session , gateway .id )
118+ _ , conn = await get_or_add_gateway_connection (session , gateway .id )
118119 try :
119120 logger .debug ("%s: registering service as %s" , fmt (run_model ), service_spec .url )
120121 async with conn .client () as client :
@@ -131,13 +132,21 @@ async def _register_service_in_gateway(
131132 ssh_private_key = run_model .project .ssh_private_key ,
132133 router = router ,
133134 )
134- logger .info ("%s: service is registered as %s" , fmt (run_model ), service_spec .url )
135135 except SSHError :
136136 raise ServerClientError ("Gateway tunnel is not working" )
137137 except httpx .RequestError as e :
138138 logger .debug ("Gateway request failed" , exc_info = True )
139139 raise GatewayError (f"Gateway is not working: { e !r} " )
140140
141+ events .emit (
142+ session ,
143+ "Service registered in gateway" ,
144+ actor = events .SystemActor (),
145+ targets = [
146+ events .Target .from_model (run_model ),
147+ events .Target .from_model (gateway ),
148+ ],
149+ )
141150 return service_spec
142151
143152
@@ -193,8 +202,9 @@ async def register_replica(
193202 ssh_head_proxy : Optional [SSHConnectionParams ],
194203 ssh_head_proxy_private_key : Optional [str ],
195204):
205+ gateway = None
196206 if gateway_id is not None :
197- conn = await get_or_add_gateway_connection (session , gateway_id )
207+ gateway , conn = await get_or_add_gateway_connection (session , gateway_id )
198208 job_submission = jobs_services .job_model_to_job_submission (job_model )
199209 try :
200210 logger .debug ("%s: registering replica for service %s" , fmt (job_model ), run .id .hex )
@@ -225,17 +235,21 @@ async def register_replica(
225235 else :
226236 raise
227237 job_model .registered = True
228- logger .info (
229- "%s: service replica registered to receive requests, gateway=%s" ,
230- fmt (job_model ),
231- gateway_id is not None ,
238+ targets = [events .Target .from_model (job_model )]
239+ if gateway is not None :
240+ targets .append (events .Target .from_model (gateway ))
241+ events .emit (
242+ session ,
243+ "Service replica registered to receive requests" ,
244+ actor = events .SystemActor (),
245+ targets = targets ,
232246 )
233247
234248
235249async def unregister_service (session : AsyncSession , run_model : RunModel ):
236250 if run_model .gateway_id is None : # in-server proxy
237251 return
238- conn = await get_or_add_gateway_connection (session , run_model .gateway_id )
252+ gateway , conn = await get_or_add_gateway_connection (session , run_model .gateway_id )
239253 res = await session .execute (
240254 select (ProjectModel ).where (ProjectModel .id == run_model .project_id )
241255 )
@@ -247,24 +261,37 @@ async def unregister_service(session: AsyncSession, run_model: RunModel):
247261 project = project .name ,
248262 run_name = run_model .run_name ,
249263 )
250- logger . debug ( "%s: service is unregistered" , fmt ( run_model ))
264+ event_msg = "Service unregistered from gateway"
251265 except GatewayError as e :
252266 # ignore if service is not registered
253267 logger .warning ("%s: unregistering service: %s" , fmt (run_model ), e )
268+ event_msg = f"Gateway error when unregistering service: { e } "
254269 except (httpx .RequestError , SSHError ) as e :
255270 logger .debug ("Gateway request failed" , exc_info = True )
256271 raise GatewayError (repr (e ))
272+ events .emit (
273+ session ,
274+ event_msg ,
275+ actor = events .SystemActor (),
276+ targets = [
277+ events .Target .from_model (run_model ),
278+ events .Target .from_model (gateway ),
279+ ],
280+ )
257281
258282
259283async def unregister_replica (session : AsyncSession , job_model : JobModel ):
284+ if not job_model .registered : # non-services and unregistered service replicas
285+ return
260286 res = await session .execute (
261287 select (RunModel )
262288 .where (RunModel .id == job_model .run_id )
263- .options (joinedload (RunModel .project ). joinedload ( ProjectModel . backends ) )
289+ .options (joinedload (RunModel .project ))
264290 )
265291 run_model = res .unique ().scalar_one ()
292+ gateway = None
266293 if run_model .gateway_id is not None :
267- conn = await get_or_add_gateway_connection (session , run_model .gateway_id )
294+ gateway , conn = await get_or_add_gateway_connection (session , run_model .gateway_id )
268295 try :
269296 logger .debug (
270297 "%s: unregistering replica from service %s" , fmt (job_model ), job_model .run_id .hex
@@ -282,10 +309,14 @@ async def unregister_replica(session: AsyncSession, job_model: JobModel):
282309 logger .debug ("Gateway request failed" , exc_info = True )
283310 raise GatewayError (repr (e ))
284311 job_model .registered = False
285- logger .info (
286- "%s: service replica unregistered from receiving requests, gateway=%s" ,
287- fmt (job_model ),
288- run_model .gateway_id is not None ,
312+ targets = [events .Target .from_model (job_model )]
313+ if gateway is not None :
314+ targets .append (events .Target .from_model (gateway ))
315+ events .emit (
316+ session ,
317+ "Service replica unregistered from receiving requests" ,
318+ actor = events .SystemActor (),
319+ targets = targets ,
289320 )
290321
291322
@@ -314,7 +345,7 @@ async def update_service_desired_replica_count(
314345) -> None :
315346 stats = None
316347 if run_model .gateway_id is not None :
317- conn = await get_or_add_gateway_connection (session , run_model .gateway_id )
348+ _ , conn = await get_or_add_gateway_connection (session , run_model .gateway_id )
318349 stats = await conn .get_stats (run_model .project .name , run_model .run_name )
319350 replica_groups = configuration .replica_groups
320351 desired_replica_counts = {}
0 commit comments