diff --git a/apps/chrome-extension/src/utils/eventOptimizer.ts b/apps/chrome-extension/src/utils/eventOptimizer.ts index 58b1a8a1d7..3089fa9c1d 100644 --- a/apps/chrome-extension/src/utils/eventOptimizer.ts +++ b/apps/chrome-extension/src/utils/eventOptimizer.ts @@ -1,5 +1,6 @@ import Service from '@midscene/core'; import type { Rect, UIContext } from '@midscene/core'; +import { ScreenshotItem } from '@midscene/core'; import type { RecordedEvent } from '@midscene/recorder'; import { globalModelConfigManager } from '@midscene/shared/env'; import { compositeElementInfoImg } from '@midscene/shared/img'; @@ -135,9 +136,13 @@ export const generateAIDescription = async ( const descriptionPromise = (async () => { try { const mockContext: UIContext = { - screenshotBase64: event.screenshotBefore as string, - size: { width: event.pageInfo.width, height: event.pageInfo.height }, - }; + screenshot: ScreenshotItem.create(event.screenshotBefore as string), + shotSize: { + width: event.pageInfo.width, + height: event.pageInfo.height, + }, + shrunkShotToLogicalRatio: 1, + } as UIContext; const service = new Service(mockContext); const rect = extractRect(event); diff --git a/apps/report/src/components/detail-panel/index.tsx b/apps/report/src/components/detail-panel/index.tsx index 4366a76a7d..718b837df5 100644 --- a/apps/report/src/components/detail-panel/index.tsx +++ b/apps/report/src/components/detail-panel/index.tsx @@ -159,7 +159,7 @@ const DetailPanel = (): JSX.Element => { } contextLocatorView = - highlightElements.length > 0 && activeTask.uiContext?.size ? ( + highlightElements.length > 0 && activeTask.uiContext?.shotSize ? 
( diff --git a/apps/site/docs/en/api.mdx b/apps/site/docs/en/api.mdx index 642f99257c..de32d358ac 100644 --- a/apps/site/docs/en/api.mdx +++ b/apps/site/docs/en/api.mdx @@ -27,6 +27,9 @@ All agents share these base options: - Using Node.js: `npx serve` - Using Python: `python -m http.server` or `python3 -m http.server` Then access the report via `http://localhost:3000` (or the port shown in the terminal). +- `screenshotShrinkFactor: number`: Controls the scaling ratio of screenshots to reduce the image size sent to the AI model, thereby reducing token consumption. The default value is 1 (no scaling). If set to 2, the width and height of the screenshot will be halved, and the area will be reduced to a quarter of the original. You can adjust this value based on your actual situation to find the best balance between image clarity and token consumption. + - For mobile devices, setting `screenshotShrinkFactor` to 2 can reduce token consumption while maintaining clarity, but it is not recommended to set it higher than 3, as this may cause the image to be too blurry and affect the AI model's understanding. + - For web pages, if the content is complex or contains a lot of details, it is not recommended to set `screenshotShrinkFactor` to avoid overly blurry screenshots. Additionally, if you want higher clarity for web page screenshots, you can configure Puppeteer or Playwright's `deviceScaleFactor` to 2, which will allow Puppeteer or Playwright to render the page as if it were a high-definition screen. 
### Custom model configuration @@ -777,7 +780,7 @@ function aiLocate( height: number; }; center: [number, number]; - scale: number; // device pixel ratio + dpr: number; // device pixel ratio }>; ``` diff --git a/apps/site/docs/en/integrate-with-any-interface.mdx b/apps/site/docs/en/integrate-with-any-interface.mdx index b6f1d5f9e7..2688f7200b 100644 --- a/apps/site/docs/en/integrate-with-any-interface.mdx +++ b/apps/site/docs/en/integrate-with-any-interface.mdx @@ -71,7 +71,6 @@ export interface SampleDeviceConfig { deviceName?: string; width?: number; height?: number; - dpr?: number; } /** @@ -86,7 +85,6 @@ export class SampleDevice implements AbstractInterface { deviceName: config.deviceName || 'Sample Device', width: config.width || 1920, height: config.height || 1080, - dpr: config.dpr || 1, }; } @@ -101,12 +99,12 @@ export class SampleDevice implements AbstractInterface { /** * Required: Get interface dimensions + * The width and height here refer to the logical size of the interface, not considering the device pixel ratio (dpr). The coordinates obtained from actions like defineActionTap are also based on this logical coordinate system. You can convert logical coordinates to physical coordinates in your action implementations if needed. 
*/ async size(): Promise { return { width: this.config.width, height: this.config.height, - dpr: this.config.dpr, }; } @@ -287,7 +285,7 @@ These are the required methods that you need to implement: - `interfaceType: string`: define a name for the interface, this will not be provided to the AI model - `screenshotBase64(): Promise`: take a screenshot of the interface and return the base64 string with the `'data:image/` prefix -- `size(): Promise`: the size and dpr of the interface, which is an object with the `width`, `height`, and `dpr` properties +- `size(): Promise`: the size of the interface, which is an object with the `width` and `height` properties - `actionSpace(): DeviceAction[] | Promise`: the action space of the interface, which is an array of `DeviceAction` objects. Use predefined actions or define any custom action. Type signatures: diff --git a/apps/site/docs/zh/api.mdx b/apps/site/docs/zh/api.mdx index 4139e69997..5f93423981 100644 --- a/apps/site/docs/zh/api.mdx +++ b/apps/site/docs/zh/api.mdx @@ -29,6 +29,9 @@ Midscene 针对每个不同环境都有对应的 Agent。每个 Agent 的构造 - 使用 Node.js:`npx serve` - 使用 Python:`python -m http.server` 或 `python3 -m http.server` 然后通过 `http://localhost:3000`(或终端显示的端口)访问报告。 +- `screenshotShrinkFactor: number`: 控制截图的缩放比例,以减少发送给 AI 模型的图像大小,从而减少 token 消耗。默认值为 1(不缩放)。如果将其设置为 2,则截图的宽高将缩小为原来的一半,面积缩小为原来的四分之一。你可以根据实际情况调整这个值,以在图像清晰度和 token 消耗之间找到最佳平衡点。 + - 对于移动端设备,将 `screenshotShrinkFactor` 设置为 2 可以在保持清晰度的同时减少 token 的消耗,但不建议设置的值超过 3,否则可能会导致图像过于模糊,影响 AI 模型的理解。 + - 对于 Web 页面,如果页面内容比较复杂或包含大量细节,不建议设置 `screenshotShrinkFactor`,以避免截图过于模糊。此外,如果为了让 Web 页面截图有更高的清晰度,可以配置 Puppeteer 或 Playwright 的 `deviceScaleFactor` 为 2,这可以让 Puppeteer 或 Playwright 按照高清屏的方式来渲染页面。 ### 自定义模型 @@ -773,7 +776,7 @@ function aiLocate( height: number; }; center: [number, number]; - scale: number; // device pixel ratio + dpr: number; // device pixel ratio }>; ``` diff --git a/apps/site/docs/zh/integrate-with-any-interface.mdx b/apps/site/docs/zh/integrate-with-any-interface.mdx index 
9cf4706334..ed261d325d 100644 --- a/apps/site/docs/zh/integrate-with-any-interface.mdx +++ b/apps/site/docs/zh/integrate-with-any-interface.mdx @@ -70,7 +70,6 @@ export interface SampleDeviceConfig { deviceName?: string; width?: number; height?: number; - dpr?: number; } /** @@ -85,7 +84,6 @@ export class SampleDevice implements AbstractInterface { deviceName: config.deviceName || 'Sample Device', width: config.width || 1920, height: config.height || 1080, - dpr: config.dpr || 1, }; } @@ -100,12 +98,12 @@ export class SampleDevice implements AbstractInterface { /** * 必需:获取界面尺寸 + * 这里的宽高是指界面的逻辑尺寸(logical size),不需要考虑设备像素比(dpr)。defineActionTap 等动作得到的坐标也是基于这个逻辑尺寸的坐标系。你可以在动作实现中根据需要将逻辑坐标转换为物理坐标。 */ async size(): Promise { return { width: this.config.width, height: this.config.height, - dpr: this.config.dpr, }; } @@ -260,7 +258,7 @@ import { AbstractInterface } from '@midscene/core'; - `interfaceType: string`:为界面定义一个名称,这不会提供给 AI 模型 - `screenshotBase64(): Promise`:截取界面的屏幕截图并返回带有 `'data:image/` 前缀的 base64 字符串 -- `size(): Promise`:界面的大小和 dpr,它是一个具有 `width`、`height` 和 `dpr` 属性的对象 +- `size(): Promise`:界面的大小,它是一个具有 `width` 和 `height` 属性的对象 - `actionSpace(): DeviceAction[] | Promise`:界面的动作空间,它是一个 `DeviceAction` 对象数组。在这里你可以使用预定义动作,或是自定义交互操作。 类型签名: diff --git a/packages/android/src/device.ts b/packages/android/src/device.ts index e23870693e..2034d39f64 100644 --- a/packages/android/src/device.ts +++ b/packages/android/src/device.ts @@ -505,7 +505,6 @@ ${Object.keys(size) return { physicalWidth: Number.parseInt(match[1], 10), physicalHeight: Number.parseInt(match[2], 10), - dpr: this.devicePixelRatio, orientation: screenSize.orientation, isCurrentOrientation: screenSize.isCurrentOrientation, }; @@ -890,7 +889,6 @@ ${Object.keys(size) return { width: logicalWidth, height: logicalHeight, - dpr: this.devicePixelRatio, }; } diff --git a/packages/android/src/scrcpy-device-adapter.ts b/packages/android/src/scrcpy-device-adapter.ts index 7057d58b73..a4cb2ba8ba 100644 --- 
a/packages/android/src/scrcpy-device-adapter.ts +++ b/packages/android/src/scrcpy-device-adapter.ts @@ -23,7 +23,6 @@ interface ResolvedScrcpyConfig { export interface DevicePhysicalInfo { physicalWidth: number; physicalHeight: number; - dpr: number; orientation: number; isCurrentOrientation?: boolean; } @@ -174,7 +173,6 @@ export class ScrcpyDeviceAdapter { return { width: resolution.width, height: resolution.height, - dpr: deviceInfo.dpr, }; } diff --git a/packages/android/tests/unit-test/agent.test.ts b/packages/android/tests/unit-test/agent.test.ts index 256b0e6bdd..c1eb1296ed 100644 --- a/packages/android/tests/unit-test/agent.test.ts +++ b/packages/android/tests/unit-test/agent.test.ts @@ -137,7 +137,7 @@ describe('AndroidAgent', () => { interfaceType: 'android', actionSpace: vi.fn().mockReturnValue([]), screenshotBase64: vi.fn(), - size: vi.fn().mockResolvedValue({ width: 0, height: 0, dpr: 1 }), + size: vi.fn().mockResolvedValue({ width: 0, height: 0 }), getElementsInfo: vi.fn(), url: vi.fn(), launch: vi.fn(), @@ -165,7 +165,7 @@ describe('AndroidAgent', () => { interfaceType: 'android', actionSpace: vi.fn().mockReturnValue([]), screenshotBase64: vi.fn(), - size: vi.fn().mockResolvedValue({ width: 0, height: 0, dpr: 1 }), + size: vi.fn().mockResolvedValue({ width: 0, height: 0 }), getElementsInfo: vi.fn(), url: vi.fn(), launch: vi.fn(), @@ -192,7 +192,7 @@ describe('AndroidAgent', () => { interfaceType: 'android', actionSpace: vi.fn().mockReturnValue([]), screenshotBase64: vi.fn(), - size: vi.fn().mockResolvedValue({ width: 0, height: 0, dpr: 1 }), + size: vi.fn().mockResolvedValue({ width: 0, height: 0 }), getElementsInfo: vi.fn(), url: vi.fn(), launch: vi.fn(), diff --git a/packages/android/tests/unit-test/page.test.ts b/packages/android/tests/unit-test/page.test.ts index 43700e52c5..76280dd8db 100644 --- a/packages/android/tests/unit-test/page.test.ts +++ b/packages/android/tests/unit-test/page.test.ts @@ -131,7 +131,7 @@ describe('AndroidDevice', () => 
{ const size1 = await device.size(); const size2 = await device.size(); - expect(size1).toEqual({ width: 540, height: 960, dpr: 2 }); + expect(size1).toEqual({ width: 540, height: 960 }); expect(size2).toEqual(size1); // Caching is removed, so it should be called twice expect(vi.spyOn(device as any, 'getScreenSize')).toHaveBeenCalledTimes(2); @@ -194,7 +194,6 @@ describe('AndroidDevice', () => { vi.spyOn(device, 'size').mockResolvedValue({ width: 1080, height: 1920, - dpr: 2, }); vi.spyOn(ImgUtils, 'isValidImageBuffer').mockReturnValue(true); vi.spyOn(ImgUtils, 'resizeAndConvertImgBuffer').mockImplementation( @@ -1174,7 +1173,6 @@ describe('AndroidDevice', () => { vi.spyOn(device, 'size').mockResolvedValue({ width: 1080, height: 1920, - dpr: 1, }); }); @@ -1976,7 +1974,6 @@ describe('AndroidDevice', () => { vi.spyOn(deviceWithDisplay, 'size').mockResolvedValue({ width: 1080, height: 1920, - dpr: 2, }); await deviceWithDisplay.screenshotBase64(); @@ -2111,7 +2108,6 @@ describe('AndroidDevice', () => { vi.spyOn(deviceWithDisplay, 'size').mockResolvedValue({ width: 1080, height: 1920, - dpr: 2, }); await deviceWithDisplay.screenshotBase64(); diff --git a/packages/android/tests/unit-test/scrcpy-adapter.test.ts b/packages/android/tests/unit-test/scrcpy-adapter.test.ts index 7d87beba35..906ab9905d 100644 --- a/packages/android/tests/unit-test/scrcpy-adapter.test.ts +++ b/packages/android/tests/unit-test/scrcpy-adapter.test.ts @@ -45,7 +45,6 @@ vi.mock('@midscene/shared/img', () => ({ const defaultDeviceInfo: DevicePhysicalInfo = { physicalWidth: 1080, physicalHeight: 1920, - dpr: 2.625, orientation: 0, }; @@ -159,7 +158,6 @@ describe('ScrcpyDeviceAdapter', () => { const highRes: DevicePhysicalInfo = { physicalWidth: 1440, physicalHeight: 3120, - dpr: 3.2, orientation: 0, }; const config = adapter.resolveConfig(highRes); @@ -175,7 +173,6 @@ describe('ScrcpyDeviceAdapter', () => { const highRes: DevicePhysicalInfo = { physicalWidth: 1440, physicalHeight: 3120, - dpr: 3.2, 
orientation: 0, }; const config = adapter.resolveConfig(highRes); @@ -187,7 +184,6 @@ describe('ScrcpyDeviceAdapter', () => { const landscape: DevicePhysicalInfo = { physicalWidth: 1920, physicalHeight: 1080, - dpr: 2, orientation: 1, }; const config = adapter.resolveConfig(landscape); @@ -237,7 +233,6 @@ describe('ScrcpyDeviceAdapter', () => { expect(size).toEqual({ width: 576, height: 1024, - dpr: 2.625, }); }); }); diff --git a/packages/computer/src/device.ts b/packages/computer/src/device.ts index e56a4bd3d9..c7c72accd4 100644 --- a/packages/computer/src/device.ts +++ b/packages/computer/src/device.ts @@ -451,7 +451,6 @@ Available Displays: ${displays.length > 0 ? displays.map((d) => d.name).join(', return { width: screenSize.width, height: screenSize.height, - dpr: 1, // Desktop typically uses logical pixels }; } catch (error) { debugDevice(`Failed to get screen size: ${error}`); diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts index d247cd61f6..f07f4ec4dd 100644 --- a/packages/core/src/agent/agent.ts +++ b/packages/core/src/agent/agent.ts @@ -62,7 +62,6 @@ import { globalConfigManager, globalModelConfigManager, } from '@midscene/shared/env'; -import { imageInfoOfBase64, resizeImgBase64 } from '@midscene/shared/img'; import { getDebug } from '@midscene/shared/logger'; import { assert, ifInBrowser, uuid } from '@midscene/shared/utils'; import { defineActionSleep } from '../device'; @@ -210,16 +209,6 @@ export class Agent< */ private hasWarnedNonVLModel = false; - /** - * Screenshot scale factor derived from actual screenshot dimensions - */ - private screenshotScale?: number; - - /** - * Internal promise to deduplicate screenshot scale computation - */ - private screenshotScalePromise?: Promise; - private executionDumpIndexByRunner = new WeakMap(); private fullActionSpace: DeviceAction[]; @@ -248,54 +237,6 @@ export class Agent< } } - /** - * Lazily compute the ratio between the physical screenshot width and the logical page 
width - */ - private async getScreenshotScale(context: UIContext): Promise { - if (this.screenshotScale !== undefined) { - return this.screenshotScale; - } - - if (!this.screenshotScalePromise) { - this.screenshotScalePromise = (async () => { - const pageWidth = context.size?.width; - assert( - pageWidth && pageWidth > 0, - `Invalid page width when computing screenshot scale: ${pageWidth}`, - ); - - debug('will get image info of base64'); - const screenshotBase64 = context.screenshot.base64; - const { width: screenshotWidth } = - await imageInfoOfBase64(screenshotBase64); - debug('image info of base64 done'); - - assert( - Number.isFinite(screenshotWidth) && screenshotWidth > 0, - `Invalid screenshot width when computing screenshot scale: ${screenshotWidth}`, - ); - - const computedScale = screenshotWidth / pageWidth; - assert( - Number.isFinite(computedScale) && computedScale > 0, - `Invalid computed screenshot scale: ${computedScale}`, - ); - - debug( - `Computed screenshot scale ${computedScale} from screenshot width ${screenshotWidth} and page width ${pageWidth}`, - ); - return computedScale; - })(); - } - - try { - this.screenshotScale = await this.screenshotScalePromise; - return this.screenshotScale; - } finally { - this.screenshotScalePromise = undefined; - } - } - private resolveReplanningCycleLimit( modelConfigForPlanning: IModelConfig, ): number { @@ -432,38 +373,10 @@ export class Agent< } // Get original context - let context: UIContext; - if (this.interface.getContext) { - debug('Using page.getContext for action:', action); - context = await this.interface.getContext(); - } else { - debug('Using commonContextParser'); - context = await commonContextParser(this.interface, { - uploadServerUrl: this.modelConfigManager.getUploadTestServerUrl(), - }); - } - - debug('will get screenshot scale'); - const computedScreenshotScale = await this.getScreenshotScale(context); - debug('computedScreenshotScale', computedScreenshotScale); - - if 
(computedScreenshotScale !== 1) { - const scaleForLog = Number.parseFloat(computedScreenshotScale.toFixed(4)); - debug( - `Applying computed screenshot scale: ${scaleForLog} (resize to logical size)`, - ); - const targetWidth = Math.round(context.size.width); - const targetHeight = Math.round(context.size.height); - debug(`Resizing screenshot to ${targetWidth}x${targetHeight}`); - const currentScreenshotBase64 = context.screenshot.base64; - const resizedBase64 = await resizeImgBase64(currentScreenshotBase64, { - width: targetWidth, - height: targetHeight, - }); - context.screenshot = ScreenshotItem.create(resizedBase64); - } else { - debug(`screenshot scale=${computedScreenshotScale}`); - } + const context = await commonContextParser(this.interface, { + uploadServerUrl: this.modelConfigManager.getUploadTestServerUrl(), + screenshotShrinkFactor: this.opts.screenshotShrinkFactor, + }); return context; } @@ -1156,19 +1069,11 @@ export class Agent< const { element } = output; - const dprValue = await (this.interface.size() as any).dpr; - const dprEntry = dprValue - ? 
{ - dpr: dprValue, - } - : {}; return { rect: element?.rect, center: element?.center, - ...dprEntry, - } as Pick & { - dpr?: number; // this field is deprecated - }; + dpr: element?.dpr, + } as Pick; } async aiAssert( diff --git a/packages/core/src/agent/task-builder.ts b/packages/core/src/agent/task-builder.ts index 96c1b1b15a..0a59837424 100644 --- a/packages/core/src/agent/task-builder.ts +++ b/packages/core/src/agent/task-builder.ts @@ -27,6 +27,9 @@ import { ifPlanLocateParamIsBbox, matchElementFromCache, matchElementFromPlan, + transformLogicalElementToScreenshot, + transformLogicalRectToScreenshotRect, + transformScreenshotElementToLogical, } from './utils'; const debug = getDebug('agent:task-builder'); @@ -280,6 +283,35 @@ export class TaskBuilder { } } + // Transform coordinates from screenshot space to logical space if needed + // This is necessary when shrunkShotToLogicalRatio !== 1 + const { shrunkShotToLogicalRatio } = uiContext; + if (shrunkShotToLogicalRatio === undefined) { + throw new Error( + 'shrunkShotToLogicalRatio is not defined in Action task', + ); + } + if (shrunkShotToLogicalRatio !== 1) { + debug( + `Transforming coordinates for action ${action.name} with shrunkShotToLogicalRatio=${shrunkShotToLogicalRatio}`, + ); + + for (const field of locateFields) { + if (param[field] && typeof param[field] === 'object') { + const element = param[field] as LocateResultElement; + if (element.center && element.rect) { + param[field] = transformScreenshotElementToLogical( + element, + shrunkShotToLogicalRatio, + ); + debug( + `Transformed ${field}: center ${element.center} -> ${param[field].center}`, + ); + } + } + } + } + debug('calling action', action.name); const actionFn = action.call.bind(this.interface); const actionResult = await actionFn(param, taskContext); @@ -368,6 +400,14 @@ export class TaskBuilder { assert(uiContext, 'uiContext is required for Service task'); + const { shrunkShotToLogicalRatio } = uiContext; + + if (shrunkShotToLogicalRatio 
=== undefined) { + throw new Error( + 'shrunkShotToLogicalRatio is not defined in locate task', + ); + } + let locateDump: ServiceDump | undefined; let locateResult: LocateResultWithDump | undefined; @@ -410,21 +450,27 @@ export class TaskBuilder { // xpath locate failed, allow fallback to cache or AI locate } } + const elementFromXpath = rectFromXpath ? generateElementByRect( - rectFromXpath, + // rectFromXpath is in logical coordinates, which should be transformed to screenshot coordinates; + transformLogicalRectToScreenshotRect( + rectFromXpath, + shrunkShotToLogicalRatio, + ), typeof param.prompt === 'string' ? param.prompt : param.prompt?.prompt || '', ) : undefined; + const isXpathHit = !!elementFromXpath; const cachePrompt = param.prompt; const locateCacheRecord = this.taskCache?.matchLocateCache(cachePrompt); const cacheEntry = locateCacheRecord?.cacheContent?.cache; - const elementFromCache = + const elementFromCacheResult = isPlanHit || isXpathHit ? null : await matchElementFromCache( @@ -436,6 +482,15 @@ export class TaskBuilder { cachePrompt, param.cacheable, ); + + // elementFromCacheResult is in logical coordinates, which should be transformed to screenshot coordinates; + const elementFromCache = elementFromCacheResult + ? 
transformLogicalElementToScreenshot( + elementFromCacheResult, + shrunkShotToLogicalRatio, + ) + : undefined; + const isCacheHit = !!elementFromCache; let elementFromAiLocate: LocateResultElement | null | undefined; @@ -485,8 +540,23 @@ export class TaskBuilder { ) { if (this.interface.cacheFeatureForPoint) { try { + // Transform coordinates to logical space for cacheFeatureForPoint + // cacheFeatureForPoint needs logical coordinates to locate elements in DOM + let pointForCache: [number, number] = element.center; + if (shrunkShotToLogicalRatio !== 1) { + pointForCache = [ + Math.round(element.center[0] / shrunkShotToLogicalRatio), + Math.round(element.center[1] / shrunkShotToLogicalRatio), + ]; + debug( + 'Transformed coordinates for cacheFeatureForPoint: %o -> %o', + element.center, + pointForCache, + ); + } + const feature = await this.interface.cacheFeatureForPoint( - element.center, + pointForCache, { targetDescription: typeof param.prompt === 'string' @@ -564,7 +634,11 @@ export class TaskBuilder { return { output: { - element, + element: { + ...element, + // backward compatibility for aiLocate, which return value needs a dpr field + dpr: uiContext.deprecatedDpr, + }, }, hitBy, }; diff --git a/packages/core/src/agent/utils.ts b/packages/core/src/agent/utils.ts index c87017cb78..1093bf5f21 100644 --- a/packages/core/src/agent/utils.ts +++ b/packages/core/src/agent/utils.ts @@ -5,6 +5,7 @@ import type { ElementCacheFeature, LocateResultElement, PlanningLocateParam, + Rect, UIContext, } from '@/types'; import { uploadTestInfoToServer } from '@/utils'; @@ -14,6 +15,7 @@ import { globalConfigManager, } from '@midscene/shared/env'; import { generateElementByRect } from '@midscene/shared/extractor'; +import { imageInfoOfBase64, resizeImgBase64 } from '@midscene/shared/img'; import { getDebug } from '@midscene/shared/logger'; import { _keyDefinitions } from '@midscene/shared/us-keyboard-layout'; import { assert, logMsg, uuid } from '@midscene/shared/utils'; @@ -21,37 
+23,117 @@ import dayjs from 'dayjs'; import type { TaskCache } from './task-cache'; import { debug as cacheDebug } from './task-cache'; -const debugProfile = getDebug('web:tool:profile'); - export async function commonContextParser( interfaceInstance: AbstractInterface, - _opt: { uploadServerUrl?: string }, + _opt: { uploadServerUrl?: string; screenshotShrinkFactor?: number }, ): Promise { + const debug = getDebug('commonContextParser'); + assert(interfaceInstance, 'interfaceInstance is required'); - debugProfile('Getting interface description'); + debug('Getting interface description'); const description = interfaceInstance.describe?.() || ''; - debugProfile('Interface description end'); + debug('Interface description end'); - debugProfile('Uploading test info to server'); + debug('Uploading test info to server'); uploadTestInfoToServer({ testUrl: description, serverUrl: _opt.uploadServerUrl, }); - debugProfile('UploadTestInfoToServer end'); + debug('UploadTestInfoToServer end'); + + debug('will get size'); + const interfaceSize = await interfaceInstance.size(); + const { width: logicalWidth, height: logicalHeight } = interfaceSize; + + if ((interfaceSize as unknown as { dpr: number }).dpr) { + console.warn( + 'Warning: return value of interface.size() include a dpr property, which is not expected and ignored. ', + ); + } + + if (!Number.isFinite(logicalWidth) || !Number.isFinite(logicalHeight)) { + throw new Error( + `Invalid interface size: width and height must be finite numbers. Received width: ${logicalWidth}, height: ${logicalHeight}`, + ); + } + + if (logicalWidth <= 0 || logicalHeight <= 0) { + throw new Error( + `Invalid interface size: width and height must be positive numbers. 
Received width: ${logicalWidth}, height: ${logicalHeight}`, + ); + } + + debug(`size: ${logicalWidth}x${logicalHeight}`); const screenshotBase64 = await interfaceInstance.screenshotBase64(); assert(screenshotBase64!, 'screenshotBase64 is required'); - debugProfile('will get size'); - const size = await interfaceInstance.size(); - debugProfile(`size: ${size.width}x${size.height} dpr: ${size.dpr}`); + // Get physical screenshot dimensions + debug('will get screenshot dimensions'); + const { width: imgWidth, height: imgHeight } = + await imageInfoOfBase64(screenshotBase64); - const screenshot = ScreenshotItem.create(screenshotBase64!); + if (!Number.isFinite(imgWidth) || !Number.isFinite(imgHeight)) { + throw new Error( + `Invalid screenshot dimensions: width and height must be finite numbers. Received width: ${imgWidth}, height: ${imgHeight}`, + ); + } + if (imgWidth <= 0 || imgHeight <= 0) { + throw new Error( + `Invalid screenshot dimensions: width and height must be positive numbers. Received width: ${imgWidth}, height: ${imgHeight}`, + ); + } + debug('screenshot dimensions', imgWidth, 'x', imgHeight); + + // Validate user-specified shrink factor + const userShrinkFactor = _opt.screenshotShrinkFactor ?? 1; + + if (!Number.isFinite(userShrinkFactor) || userShrinkFactor < 1) { + throw new Error( + `Invalid screenshotShrinkFactor: must be a finite number >= 1. 
Received: ${userShrinkFactor}`, + ); + } + + const dpr = imgWidth / logicalWidth; + + debug('calculated dpr:', dpr); + + const shrunkShotToLogicalRatio = dpr / userShrinkFactor; + + debug('shrunkShotToLogicalRatio', shrunkShotToLogicalRatio); + + if (userShrinkFactor !== 1) { + const targetWidth = Math.round(imgWidth / userShrinkFactor); + const targetHeight = Math.round(imgHeight / userShrinkFactor); + + debug( + `Applying screenshot shrink factor: ${userShrinkFactor} (physical: ${imgWidth}x${imgHeight} -> target: ${targetWidth}x${targetHeight})`, + ); + + const resizedBase64 = await resizeImgBase64(screenshotBase64, { + width: targetWidth, + height: targetHeight, + }); + return { + shotSize: { + width: targetWidth, + height: targetHeight, + }, + deprecatedDpr: dpr, + screenshot: ScreenshotItem.create(resizedBase64), + shrunkShotToLogicalRatio, + }; + } return { - size, - screenshot, + shotSize: { + width: imgWidth, + height: imgHeight, + }, + deprecatedDpr: dpr, + screenshot: ScreenshotItem.create(screenshotBase64), + shrunkShotToLogicalRatio, }; } @@ -251,3 +333,77 @@ export const parsePrompt = ( : undefined, }; }; + +/** + * Transform coordinates from screenshot coordinate system to logical coordinate system. + * When shrunkShotToLogicalRatio > 1, the screenshot is larger than logical size, + * so we need to divide coordinates by shrunkShotToLogicalRatio. 
+ * + * @param element - The locate result element with coordinates in screenshot space + * @param shrunkShotToLogicalRatio - The ratio of screenshot size to logical size + * @returns A new element with coordinates transformed to logical space + */ +export const transformScreenshotElementToLogical = ( + element: LocateResultElement, + shrunkShotToLogicalRatio: number, +): LocateResultElement => { + if (shrunkShotToLogicalRatio === 1) { + return element; + } + + return { + ...element, + center: [ + Math.round(element.center[0] / shrunkShotToLogicalRatio), + Math.round(element.center[1] / shrunkShotToLogicalRatio), + ], + rect: { + ...element.rect, + left: Math.round(element.rect.left / shrunkShotToLogicalRatio), + top: Math.round(element.rect.top / shrunkShotToLogicalRatio), + width: Math.round(element.rect.width / shrunkShotToLogicalRatio), + height: Math.round(element.rect.height / shrunkShotToLogicalRatio), + }, + }; +}; + +export const transformLogicalElementToScreenshot = ( + element: LocateResultElement, + shrunkShotToLogicalRatio: number, +): LocateResultElement => { + if (shrunkShotToLogicalRatio === 1) { + return element; + } + + return { + ...element, + center: [ + Math.round(element.center[0] * shrunkShotToLogicalRatio), + Math.round(element.center[1] * shrunkShotToLogicalRatio), + ], + rect: { + ...element.rect, + left: Math.round(element.rect.left * shrunkShotToLogicalRatio), + top: Math.round(element.rect.top * shrunkShotToLogicalRatio), + width: Math.round(element.rect.width * shrunkShotToLogicalRatio), + height: Math.round(element.rect.height * shrunkShotToLogicalRatio), + }, + }; +}; + +export const transformLogicalRectToScreenshotRect = ( + rect: Rect, + shrunkShotToLogicalRatio: number, +): Rect => { + if (shrunkShotToLogicalRatio === 1) { + return rect; + } + + return { + ...rect, + left: Math.round(rect.left * shrunkShotToLogicalRatio), + top: Math.round(rect.top * shrunkShotToLogicalRatio), + width: Math.round(rect.width * 
shrunkShotToLogicalRatio), + height: Math.round(rect.height * shrunkShotToLogicalRatio), + }; +}; diff --git a/packages/core/src/ai-model/auto-glm/planning.ts b/packages/core/src/ai-model/auto-glm/planning.ts index 7a619711e4..4e9dad5c4d 100644 --- a/packages/core/src/ai-model/auto-glm/planning.ts +++ b/packages/core/src/ai-model/auto-glm/planning.ts @@ -63,7 +63,7 @@ export async function autoGLMPlanning( const parsedAction = parseAction(parsedResponse); debug('Parsed action object:', parsedAction); - transformedActions = transformAutoGLMAction(parsedAction, context.size); + transformedActions = transformAutoGLMAction(parsedAction, context.shotSize); debug('Transformed actions:', transformedActions); } catch (parseError) { // Throw AIResponseParseError with usage and rawResponse preserved diff --git a/packages/core/src/ai-model/inspect.ts b/packages/core/src/ai-model/inspect.ts index c207f18d7c..5d4b3179ed 100644 --- a/packages/core/src/ai-model/inspect.ts +++ b/packages/core/src/ai-model/inspect.ts @@ -150,8 +150,8 @@ export async function AiLocateElement(options: { : systemPromptToLocateElement(modelFamily); let imagePayload = screenshotBase64; - let imageWidth = context.size.width; - let imageHeight = context.size.height; + let imageWidth = context.shotSize.width; + let imageHeight = context.shotSize.height; let originalImageWidth = imageWidth; let originalImageHeight = imageHeight; @@ -442,12 +442,12 @@ export async function AiLocateSection(options: { if (sectionBbox) { const targetRect = adaptBboxToRect( sectionBbox, - context.size.width, - context.size.height, + context.shotSize.width, + context.shotSize.height, 0, 0, - context.size.width, - context.size.height, + context.shotSize.width, + context.shotSize.height, modelFamily, ); debugSection('original targetRect %j', targetRect); @@ -460,12 +460,12 @@ export async function AiLocateSection(options: { .map((bbox) => { return adaptBboxToRect( bbox, - context.size.width, - context.size.height, + 
context.shotSize.width, + context.shotSize.height, 0, 0, - context.size.width, - context.size.height, + context.shotSize.width, + context.shotSize.height, modelFamily, ); }); @@ -475,7 +475,7 @@ export async function AiLocateSection(options: { const mergedRect = mergeRects([targetRect, ...referenceRects]); debugSection('mergedRect %j', mergedRect); - sectionRect = expandSearchArea(mergedRect, context.size); + sectionRect = expandSearchArea(mergedRect, context.shotSize); debugSection('expanded sectionRect %j', sectionRect); } diff --git a/packages/core/src/ai-model/llm-planning.ts b/packages/core/src/ai-model/llm-planning.ts index b3d04429a1..369331bc71 100644 --- a/packages/core/src/ai-model/llm-planning.ts +++ b/packages/core/src/ai-model/llm-planning.ts @@ -101,7 +101,7 @@ export async function plan( }, ): Promise { const { context, modelConfig, conversationHistory } = opts; - const { size } = context; + const { shotSize } = context; const screenshotBase64 = context.screenshot.base64; const { modelFamily } = modelConfig; @@ -115,8 +115,8 @@ export async function plan( }); let imagePayload = screenshotBase64; - let imageWidth = size.width; - let imageHeight = size.height; + let imageWidth = shotSize.width; + let imageHeight = shotSize.height; const rightLimit = imageWidth; const bottomLimit = imageHeight; diff --git a/packages/core/src/ai-model/ui-tars-planning.ts b/packages/core/src/ai-model/ui-tars-planning.ts index d068c5e344..efbfc6255d 100644 --- a/packages/core/src/ai-model/ui-tars-planning.ts +++ b/packages/core/src/ai-model/ui-tars-planning.ts @@ -93,13 +93,13 @@ export async function uiTarsPlanning( try { convertedText = convertBboxToCoordinates(res.content); - const { size } = context; + const { shotSize } = context; const parseResult = actionParser({ prediction: convertedText, factor: [1000, 1000], screenContext: { - width: size.width, - height: size.height, + width: shotSize.width, + height: shotSize.height, }, modelVer: uiTarsModelVersion, }); @@ 
-115,7 +115,7 @@ export async function uiTarsPlanning( ); } - const { size } = context; + const { shotSize } = context; debug( 'ui-tars modelVer', @@ -131,14 +131,14 @@ export async function uiTarsPlanning( const actionType = (action.action_type || '').toLowerCase(); if (actionType === 'click') { assert(action.action_inputs.start_box, 'start_box is required'); - const point = getPoint(action.action_inputs.start_box, size); + const point = getPoint(action.action_inputs.start_box, shotSize); const locate = { prompt: action.thought || '', bbox: pointToBbox( { x: point[0], y: point[1] }, - size.width, - size.height, + shotSize.width, + shotSize.height, ), }; @@ -150,14 +150,14 @@ export async function uiTarsPlanning( }); } else if (actionType === 'left_double') { assert(action.action_inputs.start_box, 'start_box is required'); - const point = getPoint(action.action_inputs.start_box, size); + const point = getPoint(action.action_inputs.start_box, shotSize); const locate = { prompt: action.thought || '', bbox: pointToBbox( { x: point[0], y: point[1] }, - size.width, - size.height, + shotSize.width, + shotSize.height, ), }; @@ -170,14 +170,14 @@ export async function uiTarsPlanning( }); } else if (actionType === 'right_single') { assert(action.action_inputs.start_box, 'start_box is required'); - const point = getPoint(action.action_inputs.start_box, size); + const point = getPoint(action.action_inputs.start_box, shotSize); const locate = { prompt: action.thought || '', bbox: pointToBbox( { x: point[0], y: point[1] }, - size.width, - size.height, + shotSize.width, + shotSize.height, ), }; @@ -191,8 +191,8 @@ export async function uiTarsPlanning( } else if (actionType === 'drag') { assert(action.action_inputs.start_box, 'start_box is required'); assert(action.action_inputs.end_box, 'end_box is required'); - const startPoint = getPoint(action.action_inputs.start_box, size); - const endPoint = getPoint(action.action_inputs.end_box, size); + const startPoint = 
getPoint(action.action_inputs.start_box, shotSize); + const endPoint = getPoint(action.action_inputs.end_box, shotSize); transformActions.push({ type: 'DragAndDrop', param: { @@ -200,16 +200,16 @@ export async function uiTarsPlanning( prompt: action.thought || '', bbox: pointToBbox( { x: startPoint[0], y: startPoint[1] }, - size.width, - size.height, + shotSize.width, + shotSize.height, ), }, to: { prompt: action.thought || '', bbox: pointToBbox( { x: endPoint[0], y: endPoint[1] }, - size.width, - size.height, + shotSize.width, + shotSize.height, ), }, }, diff --git a/packages/core/src/common.ts b/packages/core/src/common.ts index 452286f19c..2c2c199844 100644 --- a/packages/core/src/common.ts +++ b/packages/core/src/common.ts @@ -454,7 +454,6 @@ export const PointSchema = z.object({ export const SizeSchema = z.object({ width: z.number(), height: z.number(), - dpr: z.number().optional(), }); export const RectSchema = PointSchema.and(SizeSchema).and( diff --git a/packages/core/src/device/index.ts b/packages/core/src/device/index.ts index af3e7d4425..8684171257 100644 --- a/packages/core/src/device/index.ts +++ b/packages/core/src/device/index.ts @@ -53,9 +53,6 @@ export abstract class AbstractInterface { // @deprecated do NOT extend this method abstract evaluateJavaScript?(script: string): Promise; - // @deprecated do NOT extend this method - abstract getContext?(): Promise; - /** * Get the current time from the device. * Returns the device's current timestamp in milliseconds. 
diff --git a/packages/core/src/service/index.ts b/packages/core/src/service/index.ts index 73550746fa..7d61a45058 100644 --- a/packages/core/src/service/index.ts +++ b/packages/core/src/service/index.ts @@ -318,7 +318,7 @@ export default class Service { ): Promise> { assert(target, 'target is required for service.describe'); const context = await this.contextRetrieverFn(); - const { size } = context; + const { shotSize } = context; const screenshotBase64 = context.screenshot.base64; assert(screenshotBase64, 'screenshot is required for service.describe'); // The result of the "describe" function will be used for positioning, so essentially it is a form of grounding. @@ -338,7 +338,7 @@ export default class Service { let imagePayload = await compositeElementInfoImg({ inputImgBase64: screenshotBase64, - size, + size: shotSize, elementsPositionInfo: [ { rect: targetRect, @@ -348,12 +348,12 @@ export default class Service { }); if (opt?.deepThink) { - const searchArea = expandSearchArea(targetRect, size); + const searchArea = expandSearchArea(targetRect, shotSize); // Only crop when the search area covers at least 50% of the screen // in both dimensions. Small crops (e.g., 500px on 1920x1080) lose // too much context and cause model hallucinations. 
- const widthRatio = searchArea.width / size.width; - const heightRatio = searchArea.height / size.height; + const widthRatio = searchArea.width / shotSize.width; + const heightRatio = searchArea.height / shotSize.height; if (widthRatio >= 0.5 && heightRatio >= 0.5) { debug('describe: cropping to searchArea', searchArea); const croppedResult = await cropByRect( @@ -367,8 +367,8 @@ export default class Service { 'describe: skip cropping, search area too small (%dx%d on %dx%d)', searchArea.width, searchArea.height, - size.width, - size.height, + shotSize.width, + shotSize.height, ); } } diff --git a/packages/core/src/types.ts b/packages/core/src/types.ts index 7241c425cf..001aeb1dea 100644 --- a/packages/core/src/types.ts +++ b/packages/core/src/types.ts @@ -110,11 +110,33 @@ export interface AgentDescribeElementAtPointResult { */ export abstract class UIContext { + /** + * Screenshot of the current UI state. Its size is shotSize (already shrunk by screenshotShrinkFactor). + */ abstract screenshot: ScreenshotItem; - abstract size: Size; + /** + * Size of the screenshot after shrinking. + */ + abstract shotSize: Size; + + /** + * The ratio for converting shrunk screenshot coordinates to logical coordinates. 
+ * + * Example: + * - Physical screen width: 3000px, dpr=6 + * - Logical width: 500px + * - User-defined screenshotShrinkFactor: 2 + * - Actual shrunk screenshot width: 3000 / 2 = 1500px + * - shrunkShotToLogicalRatio: dpr / screenshotShrinkFactor = 6 / 2 = 3 + * - To map back to logical coordinates: 1500 / shrunkShotToLogicalRatio = 500px + */ + abstract shrunkShotToLogicalRatio: number; abstract _isFrozen?: boolean; + + // @deprecated - backward compatibility for aiLocate + abstract deprecatedDpr?: number; } export type EnsureObject = { [K in keyof T]: any }; @@ -969,8 +991,6 @@ export interface WebElementInfo extends BaseElement { }; } -export type WebUIContext = UIContext; - /** * Agent */ @@ -1040,6 +1060,28 @@ export interface AgentOpt { */ useDeviceTimestamp?: boolean; + /** + * Custom screenshot shrink factor to reduce AI token usage. + * When set, the screenshot will be scaled down by this factor from the physical resolution. + * + * Example: + * - Physical screen width: 3000px, dpr=6 + * - Logical width: 500px + * - screenshotShrinkFactor: 2 + * - Actual shrunk screenshot width: 3000 / 2 = 1500px + * - AI analyzes the 1500px screenshot + * - Coordinates are transformed back to logical (500px) before actions execute + * + * Benefits: + * - Reduces token usage for high-resolution screenshots + * - Maintains accuracy by scaling coordinates appropriately + * + * Must be >= 1 (shrinking only, enlarging is not supported). 
+ * + * @default 1 (no shrinking, uses original physical screenshot) + */ + screenshotShrinkFactor?: number; + /** * Custom OpenAI client factory function * diff --git a/packages/core/src/yaml.ts b/packages/core/src/yaml.ts index dda6599d51..7ccada675d 100644 --- a/packages/core/src/yaml.ts +++ b/packages/core/src/yaml.ts @@ -101,6 +101,7 @@ export type MidsceneYamlScriptAgentOpt = Pick< | 'aiActContext' | 'aiActionContext' | 'cache' + | 'screenshotShrinkFactor' >; export interface MidsceneYamlScriptConfig { diff --git a/packages/core/tests/evaluation.ts b/packages/core/tests/evaluation.ts index 734d7ea765..d267cc8ed2 100644 --- a/packages/core/tests/evaluation.ts +++ b/packages/core/tests/evaluation.ts @@ -15,7 +15,8 @@ export async function buildContext(targetDir: string): Promise { return { screenshot: ScreenshotItem.create(originalScreenshotBase64), - size, + shotSize: size, + shrunkShotToLogicalRatio: 1, }; } diff --git a/packages/core/tests/unit-test/aiaction-cacheable.test.ts b/packages/core/tests/unit-test/aiaction-cacheable.test.ts index 7679126a2c..7c156fb39c 100644 --- a/packages/core/tests/unit-test/aiaction-cacheable.test.ts +++ b/packages/core/tests/unit-test/aiaction-cacheable.test.ts @@ -41,7 +41,7 @@ describe('aiAction cacheable option propagation', () => { mockInterface = { interfaceType: 'web', screenshotBase64: vi.fn().mockResolvedValue(validBase64Image), - size: vi.fn().mockResolvedValue({ width: 1920, height: 1080, dpr: 1 }), + size: vi.fn().mockResolvedValue({ width: 1920, height: 1080 }), actionSpace: vi.fn().mockReturnValue([ { name: 'Click', @@ -61,7 +61,8 @@ describe('aiAction cacheable option propagation', () => { mockService = { contextRetrieverFn: vi.fn().mockImplementation(async () => ({ screenshot: ScreenshotItem.create(validBase64Image), - size: { width: 1920, height: 1080, dpr: 1 }, + shotSize: { width: 1920, height: 1080 }, + shrunkShotToLogicalRatio: 1, tree: { id: 'root', attributes: {}, diff --git 
a/packages/core/tests/unit-test/bbox-locate-cache.test.ts b/packages/core/tests/unit-test/bbox-locate-cache.test.ts index 9b6de0f34e..8c1416f2e3 100644 --- a/packages/core/tests/unit-test/bbox-locate-cache.test.ts +++ b/packages/core/tests/unit-test/bbox-locate-cache.test.ts @@ -44,10 +44,10 @@ function getTaskCacheInternal(taskCache: TaskCache): TaskCacheInternal { // Helper function to create mock UIContext with ScreenshotItem const createMockUIContext = async ( screenshotData: string, - size = { width: 1920, height: 1080, dpr: 1 }, + shotSize = { width: 1920, height: 1080 }, ) => { const screenshot = ScreenshotItem.create(screenshotData); - return { screenshot, size }; + return { screenshot, shotSize, shrunkShotToLogicalRatio: 1 }; }; describe('bbox locate cache fix', () => { @@ -73,7 +73,7 @@ describe('bbox locate cache fix', () => { mockInterface = { interfaceType: 'web', screenshotBase64: vi.fn().mockResolvedValue(validBase64Image), - size: vi.fn().mockResolvedValue({ width: 1920, height: 1080, dpr: 1 }), + size: vi.fn().mockResolvedValue({ width: 1920, height: 1080 }), actionSpace: vi.fn().mockReturnValue([ { name: 'Tap', @@ -104,7 +104,8 @@ describe('bbox locate cache fix', () => { const screenshot = ScreenshotItem.create(validBase64Image); return { screenshot, - size: { width: 1920, height: 1080, dpr: 1 }, + shotSize: { width: 1920, height: 1080 }, + shrunkShotToLogicalRatio: 1, tree: { id: 'root', attributes: {}, diff --git a/packages/web-integration/tests/unit-test/freeze-context.test.ts b/packages/core/tests/unit-test/freeze-context.test.ts similarity index 86% rename from packages/web-integration/tests/unit-test/freeze-context.test.ts rename to packages/core/tests/unit-test/freeze-context.test.ts index 4910d6d0d7..b55ac45ce8 100644 --- a/packages/web-integration/tests/unit-test/freeze-context.test.ts +++ b/packages/core/tests/unit-test/freeze-context.test.ts @@ -1,10 +1,8 @@ -import type { WebPage } from '@/web-element'; -import { WebPageContextParser 
} from '@/web-element'; -import { ScreenshotItem } from '@midscene/core'; -import { Agent as PageAgent } from '@midscene/core/agent'; -import { globalConfigManager } from '@midscene/shared/env'; +import { Agent as PageAgent, commonContextParser } from '@/agent'; +import type { AbstractInterface } from '@/device'; +import { ScreenshotItem } from '@/screenshot-item'; +import type { UIContext } from '@/types'; import { beforeEach, describe, expect, it, vi } from 'vitest'; -import type { WebUIContext } from '../../src'; // Mock page implementation const mockPage = { @@ -15,12 +13,14 @@ const mockPage = { actionSpace: vi.fn(() => []), screenshotBase64: vi.fn().mockResolvedValue('mock-screenshot'), evaluateJavaScript: vi.fn(), - size: vi.fn().mockResolvedValue({ width: 1920, height: 1080, dpr: 1 }), + size: vi.fn().mockResolvedValue({ width: 1920, height: 1080 }), url: vi.fn().mockResolvedValue('https://example.com'), - getContext: vi.fn().mockImplementation(async function (this: WebPage) { - return await WebPageContextParser(this, {}); + getContext: vi.fn().mockImplementation(async function ( + this: AbstractInterface, + ) { + return await commonContextParser(this, {}); }), -} as unknown as WebPage; +} as unknown as AbstractInterface; const mockedModelConfig = { MIDSCENE_MODEL_NAME: 'mock-model', @@ -30,15 +30,16 @@ const mockedModelConfig = { describe('PageAgent freeze/unfreeze page context', () => { let agent: PageAgent; - let mockContext: WebUIContext; - let mockContext2: WebUIContext; + let mockContext: UIContext; + let mockContext2: UIContext; beforeEach(async () => { vi.clearAllMocks(); // Create mock contexts mockContext = { - size: { width: 1920, height: 1080, dpr: 1 }, + shotSize: { width: 1920, height: 1080 }, + shrunkShotToLogicalRatio: 1, screenshot: ScreenshotItem.create( '', ), @@ -52,10 +53,11 @@ describe('PageAgent freeze/unfreeze page context', () => { attributes: {}, }, ], - } as unknown as WebUIContext; + } as unknown as UIContext; mockContext2 = { - 
size: { width: 1920, height: 1080, dpr: 1 }, + shotSize: { width: 1920, height: 1080 }, + shrunkShotToLogicalRatio: 1, screenshot: ScreenshotItem.create( '', ), @@ -69,9 +71,10 @@ describe('PageAgent freeze/unfreeze page context', () => { attributes: {}, }, ], - } as unknown as WebUIContext; + } as unknown as UIContext; // Create agent instance + // @ts-expect-error - access private property _id in test agent = new PageAgent(mockPage, { generateReport: false, autoPrintReportMsg: false, @@ -80,6 +83,7 @@ describe('PageAgent freeze/unfreeze page context', () => { // Mock _snapshotContext method to return different contexts on successive calls let callCount = 0; + // @ts-expect-error - access private property _id in test vi.spyOn(agent, '_snapshotContext').mockImplementation(async () => { callCount++; return callCount === 1 ? mockContext : mockContext2; @@ -148,7 +152,6 @@ describe('PageAgent freeze/unfreeze page context', () => { expect(screenshotData).toBe( '', ); - expect(frozenContext.tree).toBe(mockContext.tree); }); it('should preserve frozen flag across multiple operations', async () => { @@ -166,6 +169,7 @@ describe('PageAgent freeze/unfreeze page context', () => { describe('Context isolation and lifecycle', () => { it('should not share context between different agents', async () => { + // @ts-expect-error - access private property _id in test const agent2 = new PageAgent(mockPage, { generateReport: false, autoPrintReportMsg: false, @@ -173,6 +177,7 @@ describe('PageAgent freeze/unfreeze page context', () => { }); // Mock second agent's _snapshotContext + // @ts-expect-error - access private property _id in test vi.spyOn(agent2, '_snapshotContext').mockResolvedValue(mockContext2); // Freeze context for agent1 only @@ -236,11 +241,11 @@ describe('PageAgent freeze/unfreeze page context', () => { describe('getUIContext with frozen context', () => { it('should return frozen context for all actions when frozen', async () => { - // Mock WebPageContextParser to 
return a new context each time + // Mock commonContextParser to return a new context each time const mockParseContext = vi.fn().mockResolvedValue(mockContext2); vi.spyOn( - await import('@/web-element'), - 'WebPageContextParser', + await import('@/agent/utils'), + 'commonContextParser', ).mockImplementation(mockParseContext); // Freeze context @@ -258,17 +263,17 @@ describe('PageAgent freeze/unfreeze page context', () => { for (const action of actions) { const context = await agent.getUIContext(action); - // Should return the frozen context, not call WebPageContextParser + // Should return the frozen context, not call commonContextParser expect(context).toBe(mockContext); expect(context._isFrozen).toBe(true); } - // WebPageContextParser should not be called when frozen + // commonContextParser should not be called when frozen expect(mockParseContext).not.toHaveBeenCalled(); }); it('should return fresh context for all actions when not frozen', async () => { - // Mock WebPageContextParser + // Mock commonContextParser const mockParseContext = vi .fn() .mockResolvedValueOnce({ ...mockContext, fresh: 1 }) @@ -276,8 +281,8 @@ describe('PageAgent freeze/unfreeze page context', () => { .mockResolvedValueOnce({ ...mockContext, fresh: 3 }); vi.spyOn( - await import('@/web-element'), - 'WebPageContextParser', + await import('@/agent/utils'), + 'commonContextParser', ).mockImplementation(mockParseContext); // Test without freezing @@ -290,20 +295,20 @@ describe('PageAgent freeze/unfreeze page context', () => { expect((context2 as any).fresh).toBe(2); expect((context3 as any).fresh).toBe(3); - // WebPageContextParser should be called for each + // commonContextParser should be called for each expect(mockParseContext).toHaveBeenCalledTimes(3); }); it('should switch between frozen and fresh contexts correctly', async () => { - // Mock WebPageContextParser + // Mock commonContextParser const mockParseContext = vi .fn() .mockResolvedValueOnce({ ...mockContext2, callNumber: 1 }) 
.mockResolvedValueOnce({ ...mockContext2, callNumber: 2 }); vi.spyOn( - await import('@/web-element'), - 'WebPageContextParser', + await import('@/agent/utils'), + 'commonContextParser', ).mockImplementation(mockParseContext); // Get fresh context initially @@ -330,11 +335,11 @@ describe('PageAgent freeze/unfreeze page context', () => { }); it('should handle extract and assert actions correctly when frozen', async () => { - // Mock WebPageContextParser + // Mock commonContextParser const mockParseContext = vi.fn().mockResolvedValue(mockContext2); vi.spyOn( - await import('@/web-element'), - 'WebPageContextParser', + await import('@/agent/utils'), + 'commonContextParser', ).mockImplementation(mockParseContext); // Freeze context @@ -350,7 +355,7 @@ describe('PageAgent freeze/unfreeze page context', () => { expect(assertContext).toBe(mockContext); expect(assertContext._isFrozen).toBe(true); - // WebPageContextParser should not be called + // commonContextParser should not be called expect(mockParseContext).not.toHaveBeenCalled(); }); }); diff --git a/packages/core/tests/unit-test/report-generator.test.ts b/packages/core/tests/unit-test/report-generator.test.ts index 79eae342a9..f507100d2b 100644 --- a/packages/core/tests/unit-test/report-generator.test.ts +++ b/packages/core/tests/unit-test/report-generator.test.ts @@ -32,13 +32,15 @@ function fakeBase64(sizeBytes: number, format: 'png' | 'jpeg' = 'png'): string { */ function createDump(screenshots: ScreenshotItem[]): GroupedActionDump { const tasks = screenshots.map((s, i) => ({ + taskId: `task-${i}`, type: 'Insight' as const, subType: 'Locate', param: { prompt: `task-${i}` }, uiContext: { screenshot: s, - size: { width: 1920, height: 1080 }, - } as UIContext, + shotSize: { width: 1920, height: 1080 }, + shrunkShotToLogicalRatio: 1, + }, executor: async () => undefined, recorder: [], status: 'running' as const, @@ -553,13 +555,15 @@ describe('ReportGenerator — constant memory guarantees', () => { 
allScreenshots.push(screenshots); const tasks = screenshots.map((sc, i) => ({ + taskId: `${e}-${i}`, type: 'Insight' as const, subType: 'Locate', param: { prompt: `exec-${e}-task-${i}` }, uiContext: { screenshot: sc, - size: { width: 1920, height: 1080 }, - } as UIContext, + shotSize: { width: 1920, height: 1080 }, + shrunkShotToLogicalRatio: 1, + }, executor: async () => undefined, recorder: [], status: 'running' as const, diff --git a/packages/core/tests/unit-test/task-runner/index.test.ts b/packages/core/tests/unit-test/task-runner/index.test.ts index 0d0bb3269e..a7918567c9 100644 --- a/packages/core/tests/unit-test/task-runner/index.test.ts +++ b/packages/core/tests/unit-test/task-runner/index.test.ts @@ -65,7 +65,8 @@ const fakeUIContextBuilder = async () => { return { screenshot, tree: { node: null, children: [] }, - size: { width: 0, height: 0 }, + shotSize: { width: 0, height: 0 }, + shrunkShotToLogicalRatio: 1, } as unknown as UIContext; }; @@ -257,7 +258,8 @@ describe( return { screenshot, tree: { node: null, children: [] }, - size: { width: 0, height: 0 }, + shotSize: { width: 0, height: 0 }, + shrunkShotToLogicalRatio: 1, } as unknown as UIContext; }; @@ -304,7 +306,8 @@ describe( .mockResolvedValue({ screenshot: ScreenshotItem.create(''), tree: { node: null, children: [] }, - size: { width: 0, height: 0 }, + shotSize: { width: 0, height: 0 }, + shrunkShotToLogicalRatio: 1, } as unknown as UIContext); const runner = new TaskRunner('sub-task-error', uiContextBuilder, { diff --git a/packages/core/tests/unit-test/tasks-null-data.test.ts b/packages/core/tests/unit-test/tasks-null-data.test.ts index d7385fd1c1..46e4fcc888 100644 --- a/packages/core/tests/unit-test/tasks-null-data.test.ts +++ b/packages/core/tests/unit-test/tasks-null-data.test.ts @@ -9,7 +9,8 @@ const createMockUIContext = async (screenshotData = 'mock-screenshot') => { const screenshot = ScreenshotItem.create(screenshotData); return { screenshot, - size: { width: 1920, height: 1080 }, + 
shotSize: { width: 1920, height: 1080 }, + shrunkShotToLogicalRatio: 1, }; }; @@ -17,7 +18,8 @@ const createEmptyUIContext = async () => { const screenshot = ScreenshotItem.create(''); return { screenshot, - size: { width: 0, height: 0 }, + shotSize: { width: 0, height: 0 }, + shrunkShotToLogicalRatio: 1, }; }; @@ -151,7 +153,6 @@ describe('TaskExecutor - Null Data Handling', () => { modelName: 'mock-model', modelDescription: 'mock-model-description', intent: 'default', - from: 'legacy-env', }; const taskExecutor = new TaskExecutor({} as any, mockInsight, { @@ -196,7 +197,6 @@ describe('TaskExecutor - Null Data Handling', () => { modelName: 'mock-model', modelDescription: 'mock-model-description', intent: 'default', - from: 'legacy-env', }; const taskExecutor = new TaskExecutor({} as any, mockInsight, { @@ -237,7 +237,6 @@ describe('TaskExecutor - Null Data Handling', () => { modelName: 'mock-model', modelDescription: 'mock-model-description', intent: 'default', - from: 'legacy-env', }; const taskExecutor = new TaskExecutor({} as any, mockInsight, { @@ -276,7 +275,6 @@ describe('TaskExecutor - Null Data Handling', () => { modelName: 'mock-model', modelDescription: 'mock-model-description', intent: 'default', - from: 'legacy-env', }; const taskExecutor = new TaskExecutor({} as any, mockInsight, { @@ -317,7 +315,6 @@ describe('TaskExecutor - Null Data Handling', () => { modelName: 'mock-model', modelDescription: 'mock-model-description', intent: 'default', - from: 'legacy-env', }; const taskExecutor = new TaskExecutor({} as any, mockInsight, { diff --git a/packages/core/tests/unit-test/utils.test.ts b/packages/core/tests/unit-test/utils.test.ts index 42c38e22ee..914f06ec57 100644 --- a/packages/core/tests/unit-test/utils.test.ts +++ b/packages/core/tests/unit-test/utils.test.ts @@ -15,7 +15,12 @@ import { getMidsceneRunSubDir } from '@midscene/shared/common'; import { uuid } from '@midscene/shared/utils'; import { describe, expect, it } from 'vitest'; import { z } 
from 'zod'; -import { ifPlanLocateParamIsBbox } from '../../src/agent/utils'; +import { + ifPlanLocateParamIsBbox, + transformLogicalElementToScreenshot, + transformLogicalRectToScreenshotRect, + transformScreenshotElementToLogical, +} from '../../src/agent/utils'; import { getTmpDir, getTmpFile, @@ -794,3 +799,163 @@ describe('ifPlanLocateParamIsBbox', () => { expect(ifPlanLocateParamIsBbox(param)).toBe(false); }); }); + +describe('shrunkShotToLogicalRatio', () => { + it('transformLogicalElementToScreenshot with shrunkShotToLogicalRatio=1', () => { + expect( + transformLogicalElementToScreenshot( + { + description: 'test element', + center: [150, 250], + rect: { + left: 100, + top: 200, + width: 300, + height: 400, + dpr: 0, + }, + }, + 1, + ), + ).toStrictEqual({ + description: 'test element', + center: [150, 250], + rect: { + left: 100, + top: 200, + width: 300, + height: 400, + dpr: 0, + }, + }); + }); + + it('transformLogicalElementToScreenshot with shrunkShotToLogicalRatio=2', () => { + expect( + transformLogicalElementToScreenshot( + { + description: 'test element', + center: [150, 250], + rect: { + left: 100, + top: 200, + width: 300, + height: 400, + dpr: 0, + }, + }, + 2, + ), + ).toStrictEqual({ + description: 'test element', + center: [300, 500], + rect: { + left: 200, + top: 400, + width: 600, + height: 800, + dpr: 0, + }, + }); + }); + + it('transformLogicalRectToScreenshotRect with shrunkShotToLogicalRatio=1', () => { + expect( + transformLogicalRectToScreenshotRect( + { + left: 100, + top: 200, + width: 300, + height: 400, + dpr: 0, + }, + 1, + ), + ).toStrictEqual({ + left: 100, + top: 200, + width: 300, + height: 400, + dpr: 0, + }); + }); + + it('transformLogicalRectToScreenshotRect with shrunkShotToLogicalRatio=2', () => { + expect( + transformLogicalRectToScreenshotRect( + { + left: 100, + top: 200, + width: 300, + height: 400, + dpr: 0, + }, + 2, + ), + ).toStrictEqual({ + left: 200, + top: 400, + width: 600, + height: 800, + dpr: 0, + }); + 
}); + + it('transformScreenshotElementToLogical with shrunkShotToLogicalRatio=1', () => { + expect( + transformScreenshotElementToLogical( + { + description: 'test element', + center: [150, 250], + rect: { + left: 100, + top: 200, + width: 300, + height: 400, + dpr: 0, + }, + }, + 1, + ), + ).toStrictEqual({ + description: 'test element', + center: [150, 250], + rect: { + left: 100, + top: 200, + width: 300, + height: 400, + dpr: 0, + }, + }); + }); + + it('transformScreenshotElementToLogical with shrunkShotToLogicalRatio=2', () => { + expect( + transformScreenshotElementToLogical( + { + description: 'test element', + center: [150, 250], + rect: { + left: 100, + top: 200, + width: 300, + height: 400, + dpr: 0, + }, + }, + 2, + ), + ).toStrictEqual({ + description: 'test element', + center: [75, 125], + rect: { + left: 50, + top: 100, + width: 150, + height: 200, + dpr: 0, + }, + }); + }); +}); diff --git a/packages/core/tests/unit-test/vl-model-check.test.ts b/packages/core/tests/unit-test/vl-model-check.test.ts index d937a3a3fb..bf9dca7604 100644 --- a/packages/core/tests/unit-test/vl-model-check.test.ts +++ b/packages/core/tests/unit-test/vl-model-check.test.ts @@ -37,7 +37,7 @@ const createMockInterface = ( ({ interfaceType, destroy: vi.fn(), - size: vi.fn().mockResolvedValue({ dpr: 1 }), + size: vi.fn().mockResolvedValue({}), actionSpace: vi.fn(() => []), }) as unknown as AbstractInterface; diff --git a/packages/core/tests/utils.ts b/packages/core/tests/utils.ts index c98f278e4d..dc06d1e49e 100644 --- a/packages/core/tests/utils.ts +++ b/packages/core/tests/utils.ts @@ -33,6 +33,7 @@ export function createFakeContext(content?: string): UIContext { return { screenshot, - size: { width: 1920, height: 1080 }, + shotSize: { width: 1920, height: 1080 }, + shrunkShotToLogicalRatio: 1, }; } diff --git a/packages/evaluation/tests/llm-planning.test.ts b/packages/evaluation/tests/llm-planning.test.ts index 9261171977..9b62f1faec 100644 --- 
a/packages/evaluation/tests/llm-planning.test.ts +++ b/packages/evaluation/tests/llm-planning.test.ts @@ -8,10 +8,7 @@ import { defineActionTap, } from '@midscene/core/device'; import { sleep } from '@midscene/core/utils'; -import { - globalConfigManager, - globalModelConfigManager, -} from '@midscene/shared/env'; +import { globalModelConfigManager } from '@midscene/shared/env'; import { saveBase64Image } from '@midscene/shared/img'; import dotenv from 'dotenv'; import { afterEach, beforeAll, describe, expect, test } from 'vitest'; @@ -167,8 +164,8 @@ describe.skipIf(!globalModelFamily)('ai planning - by coordinates', () => { const indexId = index + 1; testCase.response_rect = adaptBboxToRect( res.action.locate.bbox, - context.size.width, - context.size.height, + context.shotSize.width, + context.shotSize.height, 0, 0, modelConfig.modelFamily, diff --git a/packages/evaluation/tests/util.ts b/packages/evaluation/tests/util.ts index 9a319f305e..35cc7b5c3b 100644 --- a/packages/evaluation/tests/util.ts +++ b/packages/evaluation/tests/util.ts @@ -1,12 +1,12 @@ import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs'; import path from 'node:path'; import type { PlanningAIResponse, Rect } from '@midscene/core'; +import { commonContextParser } from '@midscene/core/agent'; import { annotateRects, imageInfoOfBase64, localImg2Base64, } from '@midscene/shared/img'; -import { WebPageContextParser } from '@midscene/web'; export { annotateRects }; @@ -218,6 +218,6 @@ export async function buildContext(pageName: string) { }, }; - const context = await WebPageContextParser(fakePage as any, {}); + const context = await commonContextParser(fakePage as any, {}); return context; } diff --git a/packages/ios/src/device.ts b/packages/ios/src/device.ts index 4e927263c7..cb9159c28f 100644 --- a/packages/ios/src/device.ts +++ b/packages/ios/src/device.ts @@ -412,7 +412,6 @@ ScreenSize: ${size.width}x${size.height} (DPR: ${size.scale}) return { width: screenSize.width, 
height: screenSize.height, - dpr: screenSize.scale, }; } diff --git a/packages/ios/tests/unit-test/device.test.ts b/packages/ios/tests/unit-test/device.test.ts index 386e7b25d2..d02f795642 100644 --- a/packages/ios/tests/unit-test/device.test.ts +++ b/packages/ios/tests/unit-test/device.test.ts @@ -187,7 +187,6 @@ describe('IOSDevice', () => { expect(size).toEqual({ width: 375, height: 812, - dpr: 2, }); expect(mockWdaClient.getWindowSize).toHaveBeenCalled(); }); @@ -305,7 +304,6 @@ describe('IOSDevice', () => { expect(size).toEqual({ width: 375, height: 812, - dpr: 2, }); }); @@ -488,11 +486,6 @@ describe('IOSDevice', () => { await device.connect(); }); - it('should calculate DPR correctly', async () => { - const size = await device.size(); - expect(size.dpr).toBe(2); // DPR from mocked getScreenScale - }); - it('should handle different screen sizes', async () => { mockWdaClient.getWindowSize = vi .fn() diff --git a/packages/playground/src/types.ts b/packages/playground/src/types.ts index dffd25b022..094f8b3c56 100644 --- a/packages/playground/src/types.ts +++ b/packages/playground/src/types.ts @@ -1,4 +1,4 @@ -import type { DeviceAction, WebUIContext } from '@midscene/core'; +import type { DeviceAction } from '@midscene/core'; import type { Agent } from '@midscene/core/agent'; export interface PlaygroundAgent extends Agent { @@ -40,13 +40,6 @@ export interface ExecutionOptions { deviceOptions?: DeviceOptions; } -// Extended web types for playground - -export type PlaygroundWebUIContext = WebUIContext & { - screenshotBase64?: string; - size: { width: number; height: number; dpr?: number }; -}; - // SDK types - execution model based export type ExecutionType = 'local-execution' | 'remote-execution'; diff --git a/packages/shared/src/types/index.ts b/packages/shared/src/types/index.ts index b9766fd435..c88d549f24 100644 --- a/packages/shared/src/types/index.ts +++ b/packages/shared/src/types/index.ts @@ -9,10 +9,9 @@ export interface Point { export interface Size { 
width: number; // The image sent to AI model will be resized to this width, also the coordinates in the action space will be scaled to the range [0, width]. Usually you should set it to the logical pixel size height: number; // The image sent to AI model will be resized to this height, also the coordinates in the action space will be scaled to the range [0, height]. Usually you should set it to the logical pixel size - dpr?: number; // this is deprecated, do NOT use it } -export type Rect = Point & Size & { zoom?: number }; +export type Rect = Point & Size; export abstract class BaseElement { abstract id: string; diff --git a/packages/visualizer/src/component/blackboard/index.tsx b/packages/visualizer/src/component/blackboard/index.tsx index e4d1dbae05..e173ea6de3 100644 --- a/packages/visualizer/src/component/blackboard/index.tsx +++ b/packages/visualizer/src/component/blackboard/index.tsx @@ -89,7 +89,7 @@ export const Blackboard = (props: { const highlightPoints = props.highlightPoints; // Handle undefined/null uiContext - if (!props.uiContext?.size) { + if (!props.uiContext?.shotSize) { return (
@@ -100,7 +100,7 @@ export const Blackboard = (props: { } const context = props.uiContext; - const { size, screenshot } = context; + const { shotSize, screenshot } = context; // Extract base64 string from screenshot // After restoreImageReferences(), screenshot is { base64: string } @@ -113,8 +113,8 @@ export const Blackboard = (props: { return ''; }, [screenshot]); - const screenWidth = size.width; - const screenHeight = size.height; + const screenWidth = shotSize.width; + const screenHeight = shotSize.height; const domRef = useRef(null); // Should be HTMLDivElement not HTMLInputElement const app = useMemo(() => new PIXI.Application(), []); @@ -233,6 +233,7 @@ export const Blackboard = (props: { img.onerror = (e) => { console.error('load screenshot failed', e); }; + if (screenshotBase64) { img.src = screenshotBase64; } else { diff --git a/packages/visualizer/src/types.ts b/packages/visualizer/src/types.ts index 4d8016dd64..a1c524978e 100644 --- a/packages/visualizer/src/types.ts +++ b/packages/visualizer/src/types.ts @@ -222,12 +222,7 @@ export const extractDefaultValue = (field: ZodType): unknown => { return undefined; }; -import type { - ExecutionDump, - GroupedActionDump, - IExecutionDump, - WebUIContext, -} from '@midscene/core'; +import type { ExecutionDump, IExecutionDump } from '@midscene/core'; import type { ExecutionOptions, PlaygroundAgent } from '@midscene/playground'; // result type @@ -248,7 +243,7 @@ export interface PlaygroundProps { // static playground component props type export interface StaticPlaygroundProps { - context: WebUIContext | null; + context: UIContext | null; } // service mode type diff --git a/packages/visualizer/src/utils/playground-utils.ts b/packages/visualizer/src/utils/playground-utils.ts index cd72911ce5..6d14b2b858 100644 --- a/packages/visualizer/src/utils/playground-utils.ts +++ b/packages/visualizer/src/utils/playground-utils.ts @@ -1,4 +1,4 @@ -import type { WebUIContext } from '@midscene/core'; +import type { UIContext } 
from '@midscene/core'; import { StaticPage, StaticPageAgent } from '@midscene/web/static'; import type { ZodObjectSchema } from '../types'; import { isZodObjectSchema, unwrapZodType } from '../types'; @@ -31,7 +31,7 @@ export const actionNameForType = (type: string) => { }; // Create static agent from context -export const staticAgentFromContext = (context: WebUIContext) => { +export const staticAgentFromContext = (context: UIContext) => { const page = new StaticPage(context); return new StaticPageAgent(page); }; diff --git a/packages/visualizer/src/utils/replay-scripts.ts b/packages/visualizer/src/utils/replay-scripts.ts index c60e10a4ea..7bb3ee036e 100644 --- a/packages/visualizer/src/utils/replay-scripts.ts +++ b/packages/visualizer/src/utils/replay-scripts.ts @@ -183,9 +183,9 @@ export const allScriptsFromDump = ( normalizedDump.executions?.filter(Boolean).forEach((execution) => { execution.tasks.forEach((task) => { - if (task.uiContext?.size?.width) { - const w = task.uiContext.size.width; - const h = task.uiContext.size.height; + if (task.uiContext?.shotSize?.width) { + const w = task.uiContext.shotSize.width; + const h = task.uiContext.shotSize.height; if (!firstWidth) { firstWidth = w; firstHeight = h; @@ -409,8 +409,8 @@ export const generateAnimationScripts = ( const context = task.uiContext; if (context?.screenshot) { // show the original screenshot first - const width = context.size?.width || imageWidth; - const height = context.size?.height || imageHeight; + const width = context.shotSize?.width || imageWidth; + const height = context.shotSize?.height || imageHeight; const screenshotData = ( context.screenshot as unknown as { base64: string } ).base64; @@ -452,8 +452,8 @@ export const generateAnimationScripts = ( insightCameraDuration: locateDuration, title, subTitle: element.description || subTitle, - imageWidth: context.size?.width || imageWidth, - imageHeight: context.size?.height || imageHeight, + imageWidth: context.shotSize?.width || imageWidth, 
+ imageHeight: context.shotSize?.height || imageHeight, taskId: currentTaskId, }); @@ -480,8 +480,8 @@ export const generateAnimationScripts = ( duration: stillDuration, title: typeStr(task), subTitle: paramStr(task), - imageWidth: task.uiContext?.size?.width || imageWidth, - imageHeight: task.uiContext?.size?.height || imageHeight, + imageWidth: task.uiContext?.shotSize?.width || imageWidth, + imageHeight: task.uiContext?.shotSize?.height || imageHeight, taskId: currentTaskId, }); } @@ -525,8 +525,8 @@ export const generateAnimationScripts = ( camera: task.subType === 'Sleep' ? fullPageCameraState : undefined, title, subTitle, - imageWidth: task.uiContext?.size?.width || imageWidth, - imageHeight: task.uiContext?.size?.height || imageHeight, + imageWidth: task.uiContext?.shotSize?.width || imageWidth, + imageHeight: task.uiContext?.shotSize?.height || imageHeight, taskId: currentTaskId, }); } else { @@ -545,8 +545,8 @@ export const generateAnimationScripts = ( camera: fullPageCameraState, title, subTitle, - imageWidth: task.uiContext?.size?.width || imageWidth, - imageHeight: task.uiContext?.size?.height || imageHeight, + imageWidth: task.uiContext?.shotSize?.width || imageWidth, + imageHeight: task.uiContext?.shotSize?.height || imageHeight, taskId: currentTaskId, }); } @@ -569,8 +569,8 @@ export const generateAnimationScripts = ( duration: stillDuration, title: errorTitle, subTitle: errorSubTitle, - imageWidth: task.uiContext?.size?.width || imageWidth, - imageHeight: task.uiContext?.size?.height || imageHeight, + imageWidth: task.uiContext?.shotSize?.width || imageWidth, + imageHeight: task.uiContext?.shotSize?.height || imageHeight, taskId: currentTaskId, }); } diff --git a/packages/web-integration/src/bin.ts b/packages/web-integration/src/bin.ts index ded77cfc7a..0ee9afd3e8 100644 --- a/packages/web-integration/src/bin.ts +++ b/packages/web-integration/src/bin.ts @@ -2,13 +2,15 @@ import { PlaygroundServer } from '@midscene/playground'; import cors from 
'cors'; import { StaticPage, StaticPageAgent } from './static'; import 'dotenv/config'; +import { ScreenshotItem } from '@midscene/core'; async function startServer() { // Create page and agent instances with minimal valid data // Use screenshotBase64 field for empty screenshot const page = new StaticPage({ - size: { width: 800, height: 600 }, - screenshotBase64: '', + shotSize: { width: 800, height: 600 }, + screenshot: ScreenshotItem.create(''), + shrunkShotToLogicalRatio: 1, }); const agent = new StaticPageAgent(page); diff --git a/packages/web-integration/src/bridge-mode/agent-cli-side.ts b/packages/web-integration/src/bridge-mode/agent-cli-side.ts index 93545b0e11..8108568271 100644 --- a/packages/web-integration/src/bridge-mode/agent-cli-side.ts +++ b/packages/web-integration/src/bridge-mode/agent-cli-side.ts @@ -64,10 +64,6 @@ export const getBridgePageInCliSide = (options?: { }; } - if (prop === 'getContext') { - return undefined; - } - if (prop === 'interfaceType') { return BridgePageType; } diff --git a/packages/web-integration/src/chrome-extension/page.ts b/packages/web-integration/src/chrome-extension/page.ts index 14ae1a2cd8..350b12b37a 100644 --- a/packages/web-integration/src/chrome-extension/page.ts +++ b/packages/web-integration/src/chrome-extension/page.ts @@ -12,7 +12,6 @@ import type { Point, Rect, Size, - UIContext, } from '@midscene/core'; import type { AbstractInterface, DeviceAction } from '@midscene/core/device'; import type { ElementInfo } from '@midscene/shared/extractor'; @@ -28,7 +27,6 @@ import { judgeOrderSensitive, sanitizeXpaths, } from '../common/cache-helper'; -import { WebPageContextParser } from '../web-element'; import { type KeyInput, type MouseButton, @@ -318,7 +316,6 @@ export default class ChromeExtensionProxyPage implements AbstractInterface { size: { width: document.documentElement.clientWidth, height: document.documentElement.clientHeight, - dpr: window.devicePixelRatio, }, }; }; @@ -446,7 +443,7 @@ export default class 
ChromeExtensionProxyPage implements AbstractInterface { try { const elementInfo = await this.getElementInfoByXpath(xpath); if (elementInfo?.rect) { - return buildRectFromElementInfo(elementInfo, this.viewportSize?.dpr); + return buildRectFromElementInfo(elementInfo); } } catch (error) { debug('rectMatchesCacheFeature failed for xpath %s: %O', xpath, error); @@ -468,16 +465,11 @@ export default class ChromeExtensionProxyPage implements AbstractInterface { return content?.tree || { node: null, children: [] }; } - async getContext(): Promise { - return await WebPageContextParser(this, {}); - } - async size() { if (this.viewportSize) return this.viewportSize; const result = await this.sendCommandToDebugger('Runtime.evaluate', { - expression: - '({width: window.innerWidth, height: window.innerHeight, dpr: window.devicePixelRatio})', + expression: '({width: window.innerWidth, height: window.innerHeight})', returnByValue: true, }); diff --git a/packages/web-integration/src/common/cache-helper.ts b/packages/web-integration/src/common/cache-helper.ts index 4a12bbc6d8..8f3931a64f 100644 --- a/packages/web-integration/src/common/cache-helper.ts +++ b/packages/web-integration/src/common/cache-helper.ts @@ -55,20 +55,14 @@ export async function judgeOrderSensitive( } // Shared logic to build Rect from elementInfo -export function buildRectFromElementInfo( - elementInfo: { - rect: { left: number; top: number; width: number; height: number }; - }, - dpr?: number, -): Rect { +export function buildRectFromElementInfo(elementInfo: { + rect: { left: number; top: number; width: number; height: number }; +}): Rect { const matchedRect: Rect = { left: elementInfo.rect.left, top: elementInfo.rect.top, width: elementInfo.rect.width, height: elementInfo.rect.height, }; - if (dpr) { - matchedRect.dpr = dpr; - } return matchedRect; } diff --git a/packages/web-integration/src/index.ts b/packages/web-integration/src/index.ts index 0c006fad06..81d93048e2 100644 --- 
a/packages/web-integration/src/index.ts +++ b/packages/web-integration/src/index.ts @@ -1,10 +1,7 @@ export { PlaywrightAiFixture } from './playwright'; export type { PlayWrightAiFixtureType } from './playwright'; -export type { WebPage } from './web-element'; -export type { WebUIContext } from '@midscene/core'; export { Agent as PageAgent, type AgentOpt } from '@midscene/core/agent'; export { PuppeteerAgent } from './puppeteer'; export { PlaywrightAgent } from './playwright'; export { StaticPageAgent, StaticPage } from './static'; -export { WebPageContextParser } from './web-element'; diff --git a/packages/web-integration/src/mcp-tools-puppeteer.ts b/packages/web-integration/src/mcp-tools-puppeteer.ts index d679093852..56c676dc3a 100644 --- a/packages/web-integration/src/mcp-tools-puppeteer.ts +++ b/packages/web-integration/src/mcp-tools-puppeteer.ts @@ -3,7 +3,7 @@ import { existsSync } from 'node:fs'; import { mkdir, readFile, unlink, writeFile } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import { join } from 'node:path'; -import { z } from '@midscene/core'; +import { ScreenshotItem, z } from '@midscene/core'; import { BaseMidsceneTools, type ToolDefinition } from '@midscene/shared/mcp'; import type { Page as PuppeteerPage } from 'puppeteer'; import puppeteer from 'puppeteer-core'; @@ -156,8 +156,9 @@ const browserManager = { export class WebPuppeteerMidsceneTools extends BaseMidsceneTools { protected createTemporaryDevice() { return new StaticPage({ - screenshotBase64: '', - size: { width: 1920, height: 1080 }, + screenshot: ScreenshotItem.create(''), + shotSize: { width: 1920, height: 1080 }, + shrunkShotToLogicalRatio: 1, }); } diff --git a/packages/web-integration/src/mcp-tools.ts b/packages/web-integration/src/mcp-tools.ts index e66d3eb014..f79595a40f 100644 --- a/packages/web-integration/src/mcp-tools.ts +++ b/packages/web-integration/src/mcp-tools.ts @@ -1,4 +1,4 @@ -import { z } from '@midscene/core'; +import { ScreenshotItem, z } from 
'@midscene/core'; import { BaseMidsceneTools, type ToolDefinition } from '@midscene/shared/mcp'; import { AgentOverChromeBridge } from './bridge-mode'; import { StaticPage } from './static'; @@ -12,8 +12,9 @@ export class WebMidsceneTools extends BaseMidsceneTools { // StaticPage.actionSpace() returns DeviceAction[] which is compatible at runtime // Use screenshotBase64 field to avoid async ScreenshotItem.create() return new StaticPage({ - screenshotBase64: '', - size: { width: 1920, height: 1080 }, + screenshot: ScreenshotItem.create(''), + shotSize: { width: 1920, height: 1080 }, + shrunkShotToLogicalRatio: 1, }); } diff --git a/packages/web-integration/src/puppeteer/base-page.ts b/packages/web-integration/src/puppeteer/base-page.ts index 4a3fc4091c..82cdaebf01 100644 --- a/packages/web-integration/src/puppeteer/base-page.ts +++ b/packages/web-integration/src/puppeteer/base-page.ts @@ -1,4 +1,4 @@ -import { type WebPageAgentOpt, WebPageContextParser } from '@/web-element'; +import type { WebPageAgentOpt } from '@/web-element'; import type { DeviceAction, ElementCacheFeature, @@ -6,7 +6,6 @@ import type { Point, Rect, Size, - UIContext, } from '@midscene/core'; import type { AbstractInterface } from '@midscene/core/device'; import { sleep } from '@midscene/core/utils'; @@ -224,7 +223,7 @@ export class Page< 'rectMatchesCacheFeature: found element, rect: %o', elementInfo.rect, ); - return buildRectFromElementInfo(elementInfo, this.viewportSize?.dpr); + return buildRectFromElementInfo(elementInfo); } debugPage( 'rectMatchesCacheFeature: element found but no rect (elementInfo: %o)', @@ -264,7 +263,6 @@ export class Page< return { width: window.innerWidth, height: window.innerHeight, - dpr: window.devicePixelRatio, }; }); this.viewportSize = sizeInfo; @@ -560,9 +558,6 @@ export class Page< async destroy(): Promise {} - async getContext(): Promise { - return await WebPageContextParser(this, {}); - } async swipe( from: { x: number; y: number }, to: { x: number; y: 
number }, diff --git a/packages/web-integration/src/static/static-page.ts b/packages/web-integration/src/static/static-page.ts index 341944e171..27c59097b6 100644 --- a/packages/web-integration/src/static/static-page.ts +++ b/packages/web-integration/src/static/static-page.ts @@ -1,6 +1,5 @@ import type { DeviceAction, Point, UIContext } from '@midscene/core'; import type { AbstractInterface } from '@midscene/core/device'; -import { ScreenshotItem } from '@midscene/core'; import { defineActionDragAndDrop, defineActionHover, @@ -13,10 +12,6 @@ import { } from '@midscene/core/device'; import { ERROR_CODE_NOT_IMPLEMENTED_AS_DESIGNED } from '@midscene/shared/common'; -type WebUIContext = UIContext | { - screenshotBase64?: string; - size: { width: number; height: number; dpr?: number }; -}; const ThrowNotImplemented = (methodName: string) => { throw new Error( @@ -24,12 +19,14 @@ const ThrowNotImplemented = (methodName: string) => { ); }; +type StaticPageUIContext = Omit; + export default class StaticPage implements AbstractInterface { interfaceType = 'static'; - private uiContext: WebUIContext; + private uiContext: StaticPageUIContext; - constructor(uiContext: WebUIContext) { + constructor(uiContext: StaticPageUIContext) { this.uiContext = uiContext; } @@ -87,29 +84,16 @@ export default class StaticPage implements AbstractInterface { async size() { return { - ...this.uiContext.size, - dpr: this.uiContext.size.dpr || 1, + ...this.uiContext.shotSize }; } async screenshotBase64() { - // Check if this is a UIContext with screenshot property - if ('screenshot' in this.uiContext && this.uiContext.screenshot) { - const screenshot = this.uiContext.screenshot; - if (typeof screenshot === 'object' && 'base64' in screenshot) { - return (screenshot as { base64: string }).base64; - } - return screenshot as unknown as string; - } - - // Check legacy screenshotBase64 field - const legacyContext = this.uiContext as { screenshotBase64?: string }; - const base64 = 
legacyContext.screenshotBase64; - - if (!base64) { - throw new Error('screenshot base64 is empty'); + const screenshot = this.uiContext.screenshot; + if (typeof screenshot === 'object' && 'base64' in screenshot) { + return (screenshot as { base64: string }).base64; } - return base64; + return screenshot as unknown as string; } async url() { @@ -168,24 +152,7 @@ export default class StaticPage implements AbstractInterface { // } - async getContext(): Promise { - // If the context already has a screenshot property, return it as-is - if ('screenshot' in this.uiContext && this.uiContext.screenshot) { - return this.uiContext as UIContext; - } - - // Otherwise, create a proper UIContext from the legacy format - const screenshotBase64 = await this.screenshotBase64(); - const screenshot = ScreenshotItem.create(screenshotBase64); - const size = await this.size(); - - return { - screenshot, - size, - }; - } - - updateContext(newContext: WebUIContext): void { + updateContext(newContext: StaticPageUIContext): void { this.uiContext = newContext; } } diff --git a/packages/web-integration/src/web-element.ts b/packages/web-integration/src/web-element.ts index be998f4563..bd2f58494f 100644 --- a/packages/web-integration/src/web-element.ts +++ b/packages/web-integration/src/web-element.ts @@ -2,19 +2,11 @@ import type { AgentOpt, DeviceAction, Rect, - UIContext, WebElementInfo, } from '@midscene/core'; -import type { AbstractInterface } from '@midscene/core/device'; -import { getDebug } from '@midscene/shared/logger'; import { _keyDefinitions } from '@midscene/shared/us-keyboard-layout'; -import { commonContextParser } from '@midscene/core/agent'; import type { NodeType } from '@midscene/shared/constants'; -import type ChromeExtensionProxyPage from './chrome-extension/page'; -import type { PlaywrightWebPage } from './playwright'; -import type { PuppeteerWebPage } from './puppeteer'; -import type { StaticPage } from './static'; export type { WebElementInfo }; export type 
WebPageAgentOpt = AgentOpt & WebPageOpt; @@ -37,12 +29,6 @@ export type WebPageOpt = { customActions?: DeviceAction[]; }; -export type WebPage = - | PlaywrightWebPage - | PuppeteerWebPage - | StaticPage - | ChromeExtensionProxyPage; - export class WebElementInfoImpl implements WebElementInfo { content: string; @@ -97,36 +83,6 @@ export class WebElementInfoImpl implements WebElementInfo { } } -const debug = getDebug('web:parse-context'); -export async function WebPageContextParser( - page: AbstractInterface, - _opt: { uploadServerUrl?: string }, -): Promise { - const basicContext = await commonContextParser(page, { - uploadServerUrl: _opt.uploadServerUrl, - }); - - // debug('will traverse element tree'); - // const tree = (await page.getElementsNodeTree?.()) || { - // node: null, - // children: [], - // }; - // // const webTree = traverseTree(tree!, (elementInfo) => { - // // const { rect, id, content, attributes, indexId, isVisible } = elementInfo; - // // return new WebElementInfoImpl({ - // // rect, - // // id, - // // content, - // // attributes, - // // indexId, - // // isVisible, - // // }); - // // }); - // debug('traverse element tree end'); - - return basicContext; -} - export const limitOpenNewTabScript = ` if (!window.__MIDSCENE_NEW_TAB_INTERCEPTOR_INITIALIZED__) { window.__MIDSCENE_NEW_TAB_INTERCEPTOR_INITIALIZED__ = true; diff --git a/packages/web-integration/tests/ai/fixtures/ui-context.json b/packages/web-integration/tests/ai/fixtures/ui-context.json index 7bc15cdd55..a37e14de76 100644 --- a/packages/web-integration/tests/ai/fixtures/ui-context.json +++ b/packages/web-integration/tests/ai/fixtures/ui-context.json @@ -1069,11 +1069,12 @@ } ] }, - "size": { + "shotSize": { "width": 1440, "height": 900, "dpr": 2 }, + "shrunkShotToLogicalRatio": 1, "screenshotBase64": "", "screenshotBase64WithElementMarker": "", "url": "https://www.saucedemo.com/" diff --git a/packages/web-integration/tests/ai/web/puppeteer/screenshot-shrink-factor.test.ts 
b/packages/web-integration/tests/ai/web/puppeteer/screenshot-shrink-factor.test.ts new file mode 100644 index 0000000000..c8fb9e1663 --- /dev/null +++ b/packages/web-integration/tests/ai/web/puppeteer/screenshot-shrink-factor.test.ts @@ -0,0 +1,70 @@ +import path from 'node:path'; +import { PuppeteerAgent } from '@/puppeteer'; +import { sleep } from '@midscene/core/utils'; +import { afterEach, describe, expect, it, vi } from 'vitest'; +import { launchPage } from './utils'; + +const pageContent = ` + + + + Test Page + + + + + + + +`; + +describe('screenshotShrinkFactor', () => { + it('no-shrink', async () => { + const { originPage, reset } = await launchPage('about:blank', { + viewport: { + height: 800, + width: 600, + deviceScaleFactor: 2, + }, + }); + await originPage.setContent(pageContent); + + const agent = new PuppeteerAgent(originPage, {}); + + await agent.aiTap('button foo'); + + await agent.aiAssert('the button text is "bar"'); + + await reset(); + }); + + it('shrink-2', async () => { + const { originPage, reset } = await launchPage('about:blank', { + viewport: { + height: 800, + width: 600, + deviceScaleFactor: 2, + }, + }); + await originPage.setContent(pageContent); + + const agent = new PuppeteerAgent(originPage, { + screenshotShrinkFactor: 2, + }); + + await agent.aiTap('button foo'); + + await agent.aiAssert('the button text is "bar"'); + + await reset(); + }); +}); diff --git a/packages/web-integration/tests/ai/web/static/static-page.test.ts b/packages/web-integration/tests/ai/web/static/static-page.test.ts index 94e16addbc..c14e5db8bc 100644 --- a/packages/web-integration/tests/ai/web/static/static-page.test.ts +++ b/packages/web-integration/tests/ai/web/static/static-page.test.ts @@ -3,11 +3,14 @@ import { join } from 'node:path'; import { StaticPageAgent, StaticPage } from '@midscene/web/static'; import { PlaygroundServer } from '@midscene/playground'; import { afterEach, describe, expect, it } from 'vitest'; +import { ScreenshotItem } from 
'@midscene/core'; const dumpFilePath = join(__dirname, '../../fixtures/ui-context.json'); const context = readFileSync(dumpFilePath, { encoding: 'utf-8' }); const contextJson = JSON.parse(context); +contextJson.screenshot = contextJson.screenshotBase64; + describe( 'static page agent', { @@ -36,8 +39,9 @@ describe( it('server should work', async () => { const page = new StaticPage({ - size: { width: 800, height: 600 }, - screenshotBase64: '', + shotSize: { width: 800, height: 600 }, + shrunkShotToLogicalRatio: 1, + screenshot: ScreenshotItem.create(''), }); const agent = new StaticPageAgent(page); server = new PlaygroundServer(agent); diff --git a/packages/web-integration/tests/unit-test/__snapshots__/web-extractor.test.ts.snap b/packages/web-integration/tests/unit-test/__snapshots__/web-extractor.test.ts.snap index 42b8df6ab5..f069a6a03b 100644 --- a/packages/web-integration/tests/unit-test/__snapshots__/web-extractor.test.ts.snap +++ b/packages/web-integration/tests/unit-test/__snapshots__/web-extractor.test.ts.snap @@ -1,1880 +1,5 @@ // Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html -exports[`extractor > basic 1`] = ` -[ - { - "attributes": { - "htmlTagName": "

", - "nodeType": "TEXT Node", - }, - "content": "Data Record", - }, - { - "attributes": { - "htmlTagName": "

", - "nodeType": "TEXT Node", - }, - "content": "1970-01-01 19:25:01", - }, - { - "attributes": { - "htmlTagName": "

", - "nodeType": "TEXT Node", - }, - "content": "User Name: Stella", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "ID", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Field 2", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Field 3", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Field 4", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Field 5", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "30S", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Kace Cervantes", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Aylin Sawyer", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Jefferson Kirby", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Skyla Jefferson", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "70U", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Florence Davenport", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Dariel Acevedo", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Ashlynn Delacruz", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Memphis Leal", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "3AY", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Crystal Newman", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Anderson Brown", - 
}, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Charlotte Griffith", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Franklin Everett", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "YPG", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Kori Payne", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Edward Blevins", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Aila Gill", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Matthias Reed", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "ZEN", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Magnolia Duke", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Kalel Glover", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Alessia Barton", - }, - { - "attributes": { - "htmlTagName": "", - "nodeType": "TEXT Node", - }, - "content": "Cassius Peck", - }, - { - "attributes": { - "htmlTagName": "

", - "nodeType": "TEXT Node", - }, - "content": "Form", - }, - { - "attributes": { - "for": "name", - "htmlTagName": "