主题切换
附录 E:元素定位与视觉识别
并非所有目标应用都提供 API。当需要与没有开放接口的软件交互时,视觉识别是最可靠的方案。本附录讲解图像匹配、坐标转换和等待策略,让你的 RPA 应用真正具备"看见"屏幕并找到目标的能力。
E.1 图像模板匹配
图像模板匹配是在屏幕上查找与目标图片相似区域的技术,相当于人眼在屏幕上"寻找"某个按钮或图标。
使用 opencv4nodejs 进行模板匹配
typescript
import fs from 'fs'
import os from 'os'
import path from 'path'
import cv from '@u4/opencv4nodejs'
import { screen, Region } from '@nut-tree/nut-js'
/**
 * Locates template images on the screen, or inside a saved image file, with
 * OpenCV normalized cross-correlation (TM_CCOEFF_NORMED).
 */
class TemplateMatcher {
  /** Minimum correlation score for a location to count as a match. */
  private confidenceThreshold = 0.8

  /**
   * Finds the single best match of a template on the current screen.
   * @param templatePath Path of the template image.
   * @returns A one-element array with the best match, or an empty array when
   *          the best score is below the confidence threshold.
   */
  async findOnScreen(templatePath: string): Promise<MatchResult[]> {
    return this.withScreenshot(shot => this.findInImage(shot, templatePath))
  }

  /**
   * Finds up to `maxResults` distinct matches on the current screen
   * (greedy non-maximum suppression).
   * @param templatePath Path of the template image.
   * @param maxResults Upper bound on the number of matches returned.
   * @returns Matches ordered by descending confidence.
   */
  async findAllOnScreen(
    templatePath: string,
    maxResults = 10
  ): Promise<MatchResult[]> {
    return this.withScreenshot(shot =>
      this.findAllInImage(shot, templatePath, maxResults)
    )
  }

  /**
   * Finds the single best match of a template inside an image file.
   * @param imagePath Path of the image to search in.
   * @param templatePath Path of the template image.
   * @returns A one-element array with the best match, or an empty array.
   */
  async findInImage(imagePath: string, templatePath: string): Promise<MatchResult[]> {
    return this.findAllInImage(imagePath, templatePath, 1)
  }

  /**
   * Finds up to `maxResults` distinct matches of a template inside an image file.
   * @param imagePath Path of the image to search in.
   * @param templatePath Path of the template image.
   * @param maxResults Upper bound on the number of matches returned.
   * @returns Matches ordered by descending confidence; coordinates are the
   *          top-left corner of each match in image pixels.
   */
  async findAllInImage(
    imagePath: string,
    templatePath: string,
    maxResults = 10
  ): Promise<MatchResult[]> {
    const haystack = cv.imread(imagePath)
    const needle = cv.imread(templatePath)
    // A template larger than the image cannot match anywhere.
    if (needle.cols > haystack.cols || needle.rows > haystack.rows) return []

    const scores = haystack.matchTemplate(needle, cv.TM_CCOEFF_NORMED)
    // The score matrix is (H-h+1) x (W-w+1); the mask passed to minMaxLoc
    // must have exactly that size, not the size of the searched image.
    const mask = new cv.Mat(scores.rows, scores.cols, cv.CV_8UC1, 255)
    const suppressRadius = Math.max(
      1,
      Math.round(Math.max(needle.cols, needle.rows) / 2)
    )

    const results: MatchResult[] = []
    for (let i = 0; i < maxResults; i++) {
      const { maxVal, maxLoc } = scores.minMaxLoc(mask)
      // Written as a negated >= so that a NaN score also ends the search.
      if (!(maxVal >= this.confidenceThreshold)) break
      results.push({
        x: maxLoc.x,
        y: maxLoc.y,
        width: needle.cols,
        height: needle.rows,
        confidence: maxVal
      })
      // Each score pixel stands for a match whose top-left corner is at that
      // pixel, so suppression is centred on maxLoc itself; otherwise the same
      // peak would be reported again on the next iteration.
      mask.drawCircle(
        new cv.Point2(maxLoc.x, maxLoc.y),
        suppressRadius,
        new cv.Vec3(0, 0, 0),
        -1
      )
    }
    return results
  }

  /**
   * Saves a full-screen screenshot to a temporary PNG, runs `use` on its path
   * and always removes the file afterwards, even when `use` throws.
   */
  private async withScreenshot<T>(use: (screenshotPath: string) => Promise<T>): Promise<T> {
    // Random suffix keeps concurrent captures in the same millisecond apart.
    const name = `screen-${Date.now()}-${Math.random().toString(36).slice(2)}`
    // nut.js screen.capture(fileName, fileFormat, filePath) writes the file
    // and resolves to its full path; undefined selects the default PNG format.
    const screenshotPath = await screen.capture(name, undefined, os.tmpdir())
    try {
      return await use(screenshotPath)
    } finally {
      await fs.promises.rm(screenshotPath, { force: true })
    }
  }
}
/** A rectangular area where a template was matched, with its match score. */
interface MatchResult {
  /** x coordinate of the match location (TemplateMatcher fills it from the OpenCV max location). */
  x: number
  /** y coordinate of the match location (TemplateMatcher fills it from the OpenCV max location). */
  y: number
  /** Width of the matched area (TemplateMatcher uses the template's column count). */
  width: number
  /** Height of the matched area (TemplateMatcher uses the template's row count). */
  height: number
  /** Match score; higher means a closer match. */
  confidence: number
}
纯 JavaScript 方案(使用 sharp)
如果不想引入 OpenCV,可以用 sharp 实现简化的模板匹配:
typescript
import sharp from 'sharp'
import pixelmatch from 'pixelmatch'
/**
 * OpenCV-free template matcher that compares raw pixels decoded with sharp.
 */
class SharpTemplateMatcher {
  /**
   * Simple pixel-difference matching (suited to icons that appear unchanged).
   *
   * Performance warning: this is a sliding-window search with O(W*H*w*h)
   * worst-case cost. It is only suitable for small or low-resolution
   * screenshots; use the OpenCV approach for large images.
   *
   * @param screenshotPath Path of the image to search in.
   * @param templatePath Path of the template image.
   * @returns The first window (scanning rows top to bottom, left to right)
   *          where fewer than 1% of the pixels differ, or null if none does.
   */
  async exactMatch(
    screenshotPath: string,
    templatePath: string
  ): Promise<MatchResult | null> {
    // ensureAlpha() forces both buffers to 4 channels so the pixel strides agree.
    const [shot, tmpl] = await Promise.all([
      sharp(screenshotPath).ensureAlpha().raw().toBuffer({ resolveWithObject: true }),
      sharp(templatePath).ensureAlpha().raw().toBuffer({ resolveWithObject: true })
    ])
    const { data: screenData, info: screenInfo } = shot
    const { data: tmplData, info: tmplInfo } = tmpl

    const maxDiff = 0.01 // fewer than 1% differing pixels counts as a match
    // Every offset is visited: stepping by 2 would skip icons at odd positions.
    for (let y = 0; y <= screenInfo.height - tmplInfo.height; y++) {
      for (let x = 0; x <= screenInfo.width - tmplInfo.width; x++) {
        const diff = this.compareRegion(
          screenData, screenInfo.width, screenInfo.height,
          tmplData, tmplInfo.width, tmplInfo.height,
          x, y, maxDiff
        )
        if (diff < maxDiff) {
          return { x, y, width: tmplInfo.width, height: tmplInfo.height, confidence: 1 - diff }
        }
      }
    }
    return null
  }

  /**
   * Fraction of template pixels whose RGB value differs from the screen pixel
   * underneath when the template's top-left corner is placed at (offsetX, offsetY).
   * Both buffers must be 4-channel raw data; the alpha channel is ignored.
   *
   * @param maxDiff Stop early and return 1 as soon as the fraction reaches this value.
   * @returns A value in [0, 1]; 1 also signals an out-of-bounds or empty window.
   */
  private compareRegion(
    screenData: Buffer,
    screenWidth: number,
    screenHeight: number,
    tmplData: Buffer,
    tmplWidth: number,
    tmplHeight: number,
    offsetX: number,
    offsetY: number,
    maxDiff = 1
  ): number {
    const channels = 4
    const totalPixels = tmplWidth * tmplHeight
    const outOfBounds =
      offsetX < 0 || offsetY < 0 ||
      offsetX + tmplWidth > screenWidth ||
      offsetY + tmplHeight > screenHeight
    if (totalPixels === 0 || outOfBounds) return 1

    const abortAt = maxDiff * totalPixels
    let mismatched = 0
    for (let ty = 0; ty < tmplHeight; ty++) {
      for (let tx = 0; tx < tmplWidth; tx++) {
        const s = ((offsetY + ty) * screenWidth + offsetX + tx) * channels
        const t = (ty * tmplWidth + tx) * channels
        const same =
          screenData[s] === tmplData[t] &&
          screenData[s + 1] === tmplData[t + 1] &&
          screenData[s + 2] === tmplData[t + 2]
        // Abort as soon as the window can no longer qualify as a match.
        if (!same && ++mismatched >= abortAt) return 1
      }
    }
    return mismatched / totalPixels
  }
}
模板图像准备建议
• 截取目标元素时尽量去除背景干扰
• 保存为 PNG 格式避免压缩失真
• 同一按钮准备 2-3 个不同分辨率的模板以适配 DPI 变化
• 避免在模板中包含动态内容(如时间、用户名)
E.2 多分辨率与 DPI 适配
不同显示器有不同的缩放比例(DPI),同一按钮在 100% 和 150% 缩放下像素尺寸完全不同。RPA 应用必须能够在这种差异中准确定位。
物理像素 vs 逻辑像素
typescript
import { screen as electronScreen } from 'electron'
/** Per-display geometry record, as produced by getDisplayInfo(). */
interface DisplayInfo {
  /** Display identifier copied from the Electron display object. */
  id: number
  /** Scale factor copied from the Electron display object. */
  scaleFactor: number
  /** Display size meant to be expressed in physical (device) pixels. */
  physicalSize: { width: number; height: number }
  /** Display size meant to be expressed in logical (scaled) pixels. */
  logicalSize: { width: number; height: number }
}
/**
 * Lists every attached display with its scale factor and both size representations.
 *
 * Electron reports `Display.size` in device-independent (logical) pixels, so
 * the logical size is taken as-is and the physical size is derived by
 * multiplying by `scaleFactor`.
 *
 * @returns One DisplayInfo per display returned by `electronScreen.getAllDisplays()`.
 */
function getDisplayInfo(): DisplayInfo[] {
  return electronScreen.getAllDisplays().map(({ id, scaleFactor, size }) => ({
    id,
    scaleFactor,
    physicalSize: {
      // Rounded because fractional scale factors (e.g. 1.25) can yield non-integers.
      width: Math.round(size.width * scaleFactor),
      height: Math.round(size.height * scaleFactor)
    },
    logicalSize: { width: size.width, height: size.height }
  }))
}
/**
 * Converts a logical coordinate pair into physical coordinates
 * (for nut.js operations).
 *
 * @param x Logical x coordinate.
 * @param y Logical y coordinate.
 * @param scaleFactor Display scale factor to multiply by.
 * @returns The scaled coordinates, each rounded to the nearest integer.
 */
function logicalToPhysical(
  x: number,
  y: number,
  scaleFactor: number
): { x: number; y: number } {
  const toPhysical = (value: number): number => Math.round(value * scaleFactor)
  return { x: toPhysical(x), y: toPhysical(y) }
}
/**
 * Converts a physical coordinate pair into logical coordinates
 * (for storage and comparison).
 *
 * @param x Physical x coordinate.
 * @param y Physical y coordinate.
 * @param scaleFactor Display scale factor to divide by.
 * @returns The unrounded quotient of each coordinate and the scale factor.
 */
function physicalToLogical(
  x: number,
  y: number,
  scaleFactor: number
): { x: number; y: number } {
  const logicalX = x / scaleFactor
  const logicalY = y / scaleFactor
  return { x: logicalX, y: logicalY }
}
多分辨率模板策略
typescript
/**
 * Registry that maps a template name to per-scale image files named
 * `<base>@<scale>x<ext>`, and picks the file closest to a display's scale factor.
 */
class MultiScaleTemplateLibrary {
  private templates = new Map<string, Map<number, string>>()

  /**
   * Registers the multi-resolution variants of a template.
   * @param name Key under which the variants are stored.
   * @param basePath Path of the base image, e.g. `btn.png`.
   * @param scales Scale factors for which `<base>@<scale>x<ext>` files exist.
   */
  registerTemplate(name: string, basePath: string, scales: number[]): void {
    const scaleMap = new Map<number, string>()
    for (const scale of scales) {
      scaleMap.set(scale, this.scaledPath(basePath, scale))
    }
    this.templates.set(name, scaleMap)
  }

  /**
   * Returns the registered variant whose scale is closest to the display's scale factor.
   * @param name Template name used at registration.
   * @param displayId Electron display id.
   * @returns The template path, or null when the name or display is unknown
   *          or no scale was registered.
   */
  getBestTemplate(name: string, displayId: number): string | null {
    const scaleMap = this.templates.get(name)
    if (!scaleMap) return null
    const display = electronScreen.getAllDisplays().find(d => d.id === displayId)
    if (!display) return null

    let bestPath: string | null = null
    let minDiff = Infinity
    for (const [scale, templatePath] of scaleMap) {
      const diff = Math.abs(scale - display.scaleFactor)
      if (diff < minDiff) {
        minDiff = diff
        bestPath = templatePath
      }
    }
    return bestPath
  }

  /**
   * Generates a scaled copy of a template next to the original.
   * @param basePath Path of the source image.
   * @param targetScale Factor applied to both width and height.
   * @returns Path of the written file.
   * @throws Error when the source image dimensions cannot be read.
   */
  async generateScaledTemplate(
    basePath: string,
    targetScale: number
  ): Promise<string> {
    const outputPath = this.scaledPath(basePath, targetScale)
    // Read the metadata once instead of decoding the header for each dimension.
    const { width, height } = await sharp(basePath).metadata()
    if (width === undefined || height === undefined) {
      throw new Error(`Cannot read image dimensions: ${basePath}`)
    }
    await sharp(basePath)
      .resize({
        width: Math.round(width * targetScale),
        height: Math.round(height * targetScale),
        kernel: sharp.kernel.lanczos3 // high-quality resampling
      })
      .toFile(outputPath)
    return outputPath
  }

  /**
   * Inserts `@<scale>x` before the file extension. Only the final extension is
   * touched, so a `.png` occurring earlier in the path is left alone; a path
   * without an extension simply gets the suffix appended.
   */
  private scaledPath(basePath: string, scale: number): string {
    return basePath.replace(/(\.[^./\\]+)?$/, ext => `@${scale}x${ext}`)
  }
}
E.3 等待策略
自动化流程中经常需要等待某个条件满足后再继续执行(如等待弹窗出现、等待按钮可点击)。
轮询等待与超时
typescript
/**
 * Polling-based wait helpers: wait for an arbitrary condition, or for a
 * template image to appear on or disappear from the screen.
 */
class WaitStrategies {
  private defaultTimeout = 10000 // 10 seconds
  private defaultInterval = 500 // poll every 500 ms

  /**
   * Polls `condition` until it resolves to true.
   * @param condition Async predicate evaluated on every poll.
   * @param options `timeout` (ms, default 10000), `interval` between polls
   *                (ms, default 500) and `message` used as the timeout error prefix.
   * @throws Error when the condition has not been met within `timeout`.
   */
  async waitFor(
    condition: () => Promise<boolean>,
    options: {
      timeout?: number
      interval?: number
      message?: string
    } = {}
  ): Promise<void> {
    const {
      timeout = this.defaultTimeout,
      interval = this.defaultInterval,
      message = '等待条件满足'
    } = options
    const startTime = Date.now()
    while (Date.now() - startTime < timeout) {
      if (await condition()) return
      await this.sleep(interval)
    }
    throw new Error(`${message} 超时(${timeout}ms)`)
  }

  /**
   * Waits until the template appears on screen.
   * @param templatePath Path of the template image.
   * @param options Polling options; a caller-supplied `message` replaces the default one.
   * @returns The match observed by the poll that succeeded.
   * @throws Error on timeout.
   */
  async waitForTemplate(
    templatePath: string,
    options: WaitOptions = {}
  ): Promise<MatchResult> {
    const matcher = new TemplateMatcher()
    // Keep the match seen by the successful poll; capturing the screen again
    // afterwards could miss a template that has already disappeared.
    let found: MatchResult | undefined
    await this.waitFor(async () => {
      const [best] = await matcher.findOnScreen(templatePath)
      found = best
      return found !== undefined
    }, {
      message: `等待模板 ${path.basename(templatePath)} 出现在屏幕上`,
      ...options
    })
    if (!found) {
      throw new Error(`等待模板 ${path.basename(templatePath)} 出现在屏幕上`)
    }
    return found
  }

  /**
   * Waits until the template is no longer found on screen.
   * @param templatePath Path of the template image.
   * @param options Polling options; a caller-supplied `message` replaces the default one.
   * @throws Error on timeout.
   */
  async waitForTemplateGone(
    templatePath: string,
    options: WaitOptions = {}
  ): Promise<void> {
    const matcher = new TemplateMatcher()
    await this.waitFor(async () => {
      const results = await matcher.findOnScreen(templatePath)
      return results.length === 0
    }, {
      message: `等待模板 ${path.basename(templatePath)} 从屏幕上消失`,
      ...options
    })
  }

  /** Resolves after `ms` milliseconds; protected so subclasses can reuse it. */
  protected sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms))
  }
}
/** Options accepted by the template wait helpers. */
interface WaitOptions {
  /** Maximum time to wait, in milliseconds. */
  timeout?: number
  /** Delay between two polls, in milliseconds. */
  interval?: number
  /** Text used as the prefix of the timeout error message. */
  message?: string
}
智能等待策略
除了固定间隔的轮询,还可以实现更智能的等待:
typescript
/**
 * Wait helpers that go beyond fixed-interval polling: exponential backoff
 * and waiting for the screen content to stop changing.
 */
class SmartWait extends WaitStrategies {
  /**
   * Polls `condition` with a growing delay to reduce CPU usage.
   * The delay starts at `baseInterval`, grows by a factor of 1.5 after each
   * failed check and is capped at `maxInterval`.
   * @param condition Async predicate evaluated on every poll.
   * @param options `timeout` (ms, default 30000), `baseInterval` (ms, default 100),
   *                `maxInterval` (ms, default 2000).
   * @throws Error when the condition has not been met within `timeout`.
   */
  async waitForWithBackoff(
    condition: () => Promise<boolean>,
    options: {
      timeout?: number
      baseInterval?: number
      maxInterval?: number
    } = {}
  ): Promise<void> {
    const {
      timeout = 30000,
      baseInterval = 100,
      maxInterval = 2000
    } = options
    const startTime = Date.now()
    let attempt = 0
    while (Date.now() - startTime < timeout) {
      if (await condition()) return
      const interval = Math.min(
        baseInterval * Math.pow(1.5, attempt),
        maxInterval
      )
      attempt++
      await this.pause(interval)
    }
    throw new Error(`智能等待超时(${timeout}ms)`)
  }

  /**
   * Waits until consecutive screen captures stay identical for `stableDuration`.
   * Useful for waiting until a loading animation has finished.
   * @param options `timeout` (ms, default 30000) and `stableDuration` (ms, default 1000).
   * @throws Error when the screen has not settled within `timeout`.
   */
  async waitForStableScreen(
    options: { timeout?: number; stableDuration?: number } = {}
  ): Promise<void> {
    const { timeout = 30000, stableDuration = 1000 } = options
    const startTime = Date.now()
    let lastHash: string | null = null
    let stableStart: number | null = null
    while (Date.now() - startTime < timeout) {
      const currentHash = await this.captureScreenHash()
      if (currentHash !== lastHash) {
        // Content changed: restart the stability window.
        lastHash = currentHash
        stableStart = null
      } else if (stableStart === null) {
        stableStart = Date.now()
      } else if (Date.now() - stableStart >= stableDuration) {
        return // screen is stable
      }
      await this.pause(200)
    }
    throw new Error(`等待屏幕稳定超时(${timeout}ms)`)
  }

  /**
   * MD5 digest of the raw pixel data of a full-screen grab. Any single changed
   * pixel changes the digest; a perceptual hash (pHash) would be more tolerant.
   */
  private async captureScreenHash(): Promise<string> {
    const { createHash } = require('crypto')
    // screen.grab() returns the frame in memory; nothing is written to disk.
    const frame = await screen.grab()
    return createHash('md5').update(frame.data).digest('hex')
  }

  /**
   * Resolves after `ms` milliseconds. Kept local (and named differently)
   * because the base class declares its own `sleep` helper as private.
   */
  private pause(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms))
  }
}
E.4 视觉定位实战
案例:在桌面上找到应用图标并打开
typescript
/**
 * Shows the desktop, locates an application icon by template matching,
 * double-clicks it and waits for the application's window to appear.
 *
 * @param iconTemplatePath Path of the icon template image.
 * @param windowTitle Substring expected in the opened window's title.
 * @throws Error when the icon is not found or the window does not appear in time.
 */
async function openApplicationByIcon(
  iconTemplatePath: string,
  windowTitle = '目标应用名'
): Promise<void> {
  const matcher = new TemplateMatcher()
  const { mouse, straightTo, Point, Button } = require('@nut-tree/nut-js')

  // 1. Show the desktop. Key.LeftSuper is the Windows key, so this is Win+D.
  await keyboard.pressKey(Key.LeftSuper, Key.D)
  await keyboard.releaseKey(Key.LeftSuper, Key.D)
  await sleep(500)

  // 2. Wait for the desktop to settle.
  await smartWait.waitForStableScreen({ stableDuration: 500 })

  // 3. Locate the icon.
  const matches = await matcher.findOnScreen(iconTemplatePath)
  if (matches.length === 0) {
    throw new Error('未在桌面上找到目标应用图标')
  }

  // 4. Double-click the icon centre. Coordinates are rounded because half a
  //    template width/height can be fractional.
  // NOTE(review): match coordinates are screenshot pixels; on scaled displays
  // they may need converting before being used as mouse coordinates - confirm.
  const target = matches[0]
  const centerX = Math.round(target.x + target.width / 2)
  const centerY = Math.round(target.y + target.height / 2)
  await mouse.move(straightTo(new Point(centerX, centerY)))
  // `left` exported by nut.js is a relative-movement helper, not a button.
  await mouse.doubleClick(Button.LEFT)

  // 5. Wait for the application window.
  await waitStrategies.waitFor(async () => {
    const windows = await getWindows()
    return windows.some(w => w.title.includes(windowTitle))
  }, { timeout: 15000 })
}
案例:等待弹窗并点击确定
typescript
/**
 * Waits for a dialog to appear and clicks a button located inside it.
 *
 * @param dialogTemplatePath Template image of the dialog.
 * @param buttonTemplatePath Template image of the button to click.
 * @throws Error when the dialog does not appear in time or no matching
 *         button lies inside the dialog area.
 */
async function waitAndClickDialog(
  dialogTemplatePath: string,
  buttonTemplatePath: string
): Promise<void> {
  const matcher = new TemplateMatcher()
  const { mouse, straightTo, Point, Button } = require('@nut-tree/nut-js')

  // 1. Wait for the dialog.
  const dialogMatch = await waitStrategies.waitForTemplate(dialogTemplatePath, {
    timeout: 20000,
    message: '等待操作结果弹窗'
  })

  // 2. Match the button on the full screen and keep the first candidate whose
  //    centre lies inside the dialog. findAllOnScreen returns screen
  //    coordinates, so no dialog offset is added afterwards.
  const candidates = await matcher.findAllOnScreen(buttonTemplatePath)
  const btn = candidates.find(candidate => {
    const cx = candidate.x + candidate.width / 2
    const cy = candidate.y + candidate.height / 2
    return (
      cx >= dialogMatch.x && cx <= dialogMatch.x + dialogMatch.width &&
      cy >= dialogMatch.y && cy <= dialogMatch.y + dialogMatch.height
    )
  })
  if (!btn) {
    throw new Error('在弹窗中未找到目标按钮')
  }

  // 3. Click the button centre.
  await mouse.move(straightTo(new Point(
    Math.round(btn.x + btn.width / 2),
    Math.round(btn.y + btn.height / 2)
  )))
  // `left` exported by nut.js is a relative-movement helper, not a button.
  await mouse.click(Button.LEFT)
}
案例:多显示器环境下的跨屏幕定位
typescript
/**
 * Searches each display in turn for a template and returns the first match,
 * translated into global coordinates.
 *
 * @param templatePath Path of the template image.
 * @param confidenceThreshold Minimum TM_CCOEFF_NORMED score accepted as a match.
 * @returns The best match on the first display that has one.
 * @throws Error when no display contains a match.
 */
async function findAcrossMonitors(
  templatePath: string,
  confidenceThreshold = 0.8
): Promise<MatchResult> {
  const needle = cv.imread(templatePath)
  for (const display of electronScreen.getAllDisplays()) {
    const { x: originX, y: originY, width, height } = display.bounds
    // NOTE(review): Electron bounds are in DIP while the capture may be in
    // device pixels on scaled displays - confirm the units agree.
    // captureRegion writes the PNG and resolves to its full path.
    const tmpPath = await screen.captureRegion(
      `monitor-${display.id}-${Date.now()}`,
      new Region(originX, originY, width, height),
      undefined,
      os.tmpdir()
    )
    try {
      const { maxVal, maxLoc } = cv
        .imread(tmpPath)
        .matchTemplate(needle, cv.TM_CCOEFF_NORMED)
        .minMaxLoc()
      if (maxVal >= confidenceThreshold) {
        // Shift from display-local to global coordinates.
        return {
          x: maxLoc.x + originX,
          y: maxLoc.y + originY,
          width: needle.cols,
          height: needle.rows,
          confidence: maxVal
        }
      }
    } finally {
      // Remove the temporary capture even when matching throws.
      await fs.promises.rm(tmpPath, { force: true })
    }
  }
  throw new Error('在所有显示器上均未找到目标')
}
E.5 Tesseract.js OCR 文字识别
当图像模板匹配无法满足需求时(如动态文本、不同语言的按钮),OCR 文字识别是重要的补充手段。Tesseract.js 是 Tesseract OCR 引擎的纯 JavaScript 移植版。
安装与初始化
bash
npm install tesseract.js
typescript
import Tesseract, { createWorker } from 'tesseract.js'
import { screen, Region } from '@nut-tree/nut-js'
import fs from 'fs'
import path from 'path'
import os from 'os'
/**
 * Thin wrapper around a Tesseract.js worker that recognizes text in screen
 * regions and reports word positions in screen coordinates.
 */
class OCREngine {
  private worker: Tesseract.Worker | null = null

  /**
   * Creates the OCR worker. Calling it again replaces the previous worker.
   * @param language Recognition language(s); defaults to Simplified Chinese + English.
   */
  async initialize(language = 'chi_sim+eng'): Promise<void> {
    // Terminate a previous worker so re-initialization does not leak it.
    await this.destroy()
    this.worker = await createWorker(language)
    await this.worker.setParameters({
      tessedit_pageseg_mode: Tesseract.PSM.AUTO, // automatic page segmentation
      tessedit_char_whitelist: '', // empty = no character restriction
    })
    console.log('OCR 引擎已就绪,语言:', language)
  }

  /**
   * Recognizes the text inside a screen region.
   * @param region Region to capture.
   * @returns Words with confidence above 60; each bbox is offset by the
   *          region's left/top so it is expressed in screen coordinates.
   * @throws Error when initialize() has not been called.
   */
  async recognizeRegion(
    region: { left: number; top: number; width: number; height: number }
  ): Promise<OCRResult[]> {
    if (!this.worker) throw new Error('OCR 引擎未初始化,请先调用 initialize()')
    // captureRegion writes the PNG and resolves to its full path.
    const tmpPath = await screen.captureRegion(
      `ocr-${Date.now()}-${Math.random().toString(36).slice(2)}`,
      new Region(region.left, region.top, region.width, region.height),
      undefined,
      os.tmpdir()
    )
    try {
      const { data } = await this.worker.recognize(tmpPath)
      return data.words
        .filter(w => w.confidence > 60) // drop low-confidence words
        .map(w => ({
          text: w.text,
          confidence: w.confidence,
          bbox: {
            x: region.left + w.bbox.x0,
            y: region.top + w.bbox.y0,
            width: w.bbox.x1 - w.bbox.x0,
            height: w.bbox.y1 - w.bbox.y0,
          }
        }))
    } finally {
      // Remove the temporary capture even when recognition throws.
      await fs.promises.rm(tmpPath, { force: true })
    }
  }

  /**
   * Finds the first recognized word that contains `searchText`.
   * Useful for locating text labels, button captions and similar.
   * @param searchText Text to look for.
   * @param options `region` limits the search area (default: whole screen);
   *                `caseSensitive` defaults to false.
   * @returns `x`/`y` of the word's centre plus its `width`/`height`, or null.
   */
  async findTextOnScreen(
    searchText: string,
    options: {
      region?: { left: number; top: number; width: number; height: number }
      caseSensitive?: boolean
    } = {}
  ): Promise<{ x: number; y: number; width: number; height: number } | null> {
    const searchRegion = options.region ?? {
      left: 0,
      top: 0,
      width: await screen.width(),
      height: await screen.height()
    }
    const results = await this.recognizeRegion(searchRegion)
    const needle = options.caseSensitive ? searchText : searchText.toLowerCase()
    const target = results.find(r =>
      (options.caseSensitive ? r.text : r.text.toLowerCase()).includes(needle)
    )
    if (!target) return null
    // x/y are the centre of the word's bounding box.
    return {
      x: target.bbox.x + target.bbox.width / 2,
      y: target.bbox.y + target.bbox.height / 2,
      width: target.bbox.width,
      height: target.bbox.height,
    }
  }

  /** Terminates the worker and releases its resources. */
  async destroy(): Promise<void> {
    await this.worker?.terminate()
    this.worker = null
  }
}
/** One recognized word as returned by OCREngine.recognizeRegion(). */
interface OCRResult {
  /** Recognized text of the word. */
  text: string
  /** Recognition confidence reported by the OCR worker. */
  confidence: number
  /** Bounding box of the word; recognizeRegion() offsets it by the captured region's left/top. */
  bbox: { x: number; y: number; width: number; height: number }
}
下载中文语言包
Tesseract.js 在首次调用 createWorker('chi_sim') 时会自动下载中文简体语言包(约 10MB)。你也可以提前下载以加快首次使用:
typescript
// 在应用启动时预加载语言包
import { createWorker } from 'tesseract.js'
/**
 * Creates and immediately terminates one worker per language ('chi_sim',
 * then 'eng'), one after the other, logging each language once its worker
 * has been created.
 */
async function preloadOCRLanguages(): Promise<void> {
  for (const code of ['chi_sim', 'eng']) {
    const warmupWorker = await createWorker(code)
    console.log(`OCR 语言包已加载: ${code}`)
    await warmupWorker.terminate()
  }
}
实战案例:从截图中提取并点击文字
typescript
import { mouse, straightTo, Point, left } from '@nut-tree/nut-js'
/**
 * Finds `text` on screen via OCR and left-clicks its centre.
 *
 * @param text Text to look for (matched by OCREngine.findTextOnScreen).
 * @param engine Optional, already initialized engine to reuse. When omitted, a
 *               temporary engine is created and destroyed inside this call.
 * @returns true when the text was found and clicked, false otherwise.
 */
async function clickOnText(text: string, engine?: OCREngine): Promise<boolean> {
  const ownsEngine = engine === undefined
  const ocr = engine ?? new OCREngine()
  if (ownsEngine) await ocr.initialize('chi_sim+eng')
  try {
    const pos = await ocr.findTextOnScreen(text)
    if (!pos) {
      console.log(`未找到文字: "${text}"`)
      return false
    }
    // The box centre can be fractional; the mouse target is rounded to whole pixels.
    const x = Math.round(pos.x)
    const y = Math.round(pos.y)
    await mouse.move(straightTo(new Point(x, y)))
    // `left` exported by nut.js is a relative-movement helper, not a button,
    // so the dedicated leftClick() is used instead of click(left).
    await mouse.leftClick()
    console.log(`已点击文字: "${text}" at (${x}, ${y})`)
    return true
  } finally {
    // Release a self-created worker even when OCR or the mouse action throws.
    if (ownsEngine) await ocr.destroy()
  }
}
OCR 使用建议
- 缩小识别区域:不要全屏 OCR,尽量通过粗略位置缩小到按钮/菜单区域,大幅提升速度和准确率
- 预处理图像:对截图进行灰度化、二值化、放大处理可显著提高识别率
- 语言包按需加载:如果只识别数字或英文,使用 'eng' 即可,避免加载大型中文包
- 缓存 Worker 实例:不要每次识别都创建新的 Worker,复用实例更高效
- 置信度阈值:中文 OCR 建议 confidence > 60,英文可设为 confidence > 50
性能优化建议
视觉识别是计算密集型操作。生产环境中建议:
• 缩小搜索区域(基于上一次位置或元素层级关系)
• 降低截图分辨率进行初步匹配
• 对已知位置的元素使用相对坐标而非每次都做图像匹配
• 缓存匹配结果,在屏幕未变化时复用