实现思路
使用 Vosk 模型将输入语音转换为文本
txt字典导航程序路径
pyttsx3引擎进行语音打印输出
关键词=程序路径
完整代码
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
import json
import os
import queue
import subprocess
import sys
import threading
import time

import pyttsx3
import sounddevice as sd
from vosk import Model, KaldiRecognizer
# Initialise the pyttsx3 text-to-speech engine used for all spoken output.
engine = pyttsx3.init()
engine.setProperty('rate', 150)  # set the speaking rate
engine.setProperty('volume', 1.0)  # set the volume
# Locate the Vosk model; without it nothing can be recognised, so report the
# problem on screen and by voice, then stop.
model_path = r"D:\daku\yuyinshibie\vosk-model-small-cn-0.22"
if not os.path.exists(model_path):
    print(f"模型路径不存在: {model_path}")
    engine.say(f"模型路径不存在: {model_path}")
    engine.runAndWait()
    # The bare exit() helper is installed by the `site` module and is absent
    # under `python -S` or in frozen executables; raising SystemExit always
    # works and needs no import.
    raise SystemExit(1)
def load_app_dict(file_path):
    """Load the spoken-keyword -> program-path mapping from a text file.

    Each useful line has the form ``keyword[,alias,...]=program path``.
    Every comma-separated alias on the left is mapped to the same path.
    Lines without ``=``, with an empty path, or with an empty keyword are
    ignored.

    Args:
        file_path: Path of the dictionary text file (UTF-8, with or
            without a byte-order mark).

    Returns:
        A dict mapping each keyword to its program path. The dict is empty
        when the file does not exist; in that case the problem is also
        printed and spoken through the module-level ``engine``.
    """
    app_dict = {}
    if not os.path.exists(file_path):
        message = f"字典文件不存在: {file_path}"
        print(message)
        engine.say(message)
        engine.runAndWait()
        return app_dict

    # 'utf-8-sig' reads plain UTF-8 as well, and strips the BOM that some
    # Windows editors prepend; otherwise the BOM would become part of the
    # first keyword and that keyword could never be matched.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line in file:
            # Split on the FIRST '=' only, so program paths or arguments
            # that themselves contain '=' are kept intact.
            keys, separator, value = line.strip().partition('=')
            value = value.strip()
            if not separator or not value:
                continue
            # Handle aliases such as "微信,weixin".
            for key in keys.split(','):
                key = key.strip()
                if key:
                    app_dict[key] = value
    return app_dict
def launch_application(app_name, app_dict):
    """Start the program registered under ``app_name`` and announce it.

    Args:
        app_name: Keyword extracted from the spoken command.
        app_dict: Mapping of keyword -> program path.

    The outcome (starting, unknown keyword, or launch failure) is spoken
    through ``say``. A failed launch is reported instead of raised, so the
    caller's listening loop keeps running.
    """
    if app_name not in app_dict:
        say(f"找不到与 '{app_name}' 对应的应用程序。")
        return

    app_path = app_dict[app_name]
    say(f"正在启动 {app_name}...")
    try:
        subprocess.Popen(app_path)
    except OSError as exc:
        # Raised for a missing file or a non-executable path.
        print(f"启动 {app_name} 失败: {exc}")
        say(f"启动 {app_name} 失败。")
        return
    time.sleep(2)  # wait 2 seconds before listening again
def say(text):
    """Speak ``text`` aloud, pausing audio capture while speaking.

    When the module-level ``stream`` has been created it is stopped before
    speaking and started again afterwards. The restart sits in a
    ``finally`` clause so that a failure inside the speech engine cannot
    leave capture stopped.

    Args:
        text: The sentence to speak.
    """
    if stream is not None:
        with stream_lock:
            # NOTE(review): sounddevice documents the callback only as a
            # constructor argument, so this attribute assignment is
            # presumably a no-op and stop() does the pausing -- confirm.
            stream.callback = None
            stream.stop()  # pause the audio stream
    try:
        engine.say(text)
        engine.runAndWait()
    finally:
        if stream is not None:
            with stream_lock:
                stream.start()  # resume the audio stream
                stream.callback = callback_func
# Speech recogniser built from the Vosk model at model_path; 16000 is the
# sample rate passed to the recogniser.
model = Model(model_path)
rec = KaldiRecognizer(model, 16000)

# Queue that the capture callback fills with raw audio chunks.
q = queue.Queue()

# Last partial / final recognised texts, kept so repeats can be skipped.
last_partial_result = last_full_command = ""

# Audio stream state read by say(): the lock guards stop/start, and the
# stream and its callback stay None until the main program creates them.
stream_lock = threading.Lock()
stream = callback_func = None
def callback(indata, frames, time, status):
    """Audio-stream callback: queue each captured chunk for recognition.

    Args:
        indata: Buffer holding the captured audio; copied to ``bytes``.
        frames: Unused here.
        time: Unused here (shadows the ``time`` module inside this function
            only).
        status: When truthy, it is printed to standard error.
    """
    if status:
        # Requires `sys` to be imported at module level.
        print(status, file=sys.stderr)
    # Copy the buffer into bytes before queueing it for the main loop.
    q.put(bytes(indata))
# Main program: listen on the microphone, echo what was recognised, and start
# the matching application when a command contains "打开".
if __name__ == "__main__":
    dict_file = r"D:\daku\yuyinshibie\zidian.txt"  # dictionary file path
    app_dict = load_app_dict(dict_file)

    try:
        # Create the audio stream up front so say() can pause and resume it.
        callback_func = callback
        stream = sd.RawInputStream(
            samplerate=16000,
            blocksize=8000,
            dtype='int16',
            channels=1,
            callback=callback,
        )
        stream.start()

        say("请说:")
        while True:
            data = q.get()
            if rec.AcceptWaveform(data):
                # A complete utterance was recognised.
                result = json.loads(rec.Result())
                command = result.get('text', '').strip()
                if command and command != last_full_command:
                    print(f"你说的是: {command}")
                    say(f"你说的是: {command}")
                    if "打开" in command:
                        app_to_open = command.replace("打开", "").strip()
                        if app_to_open not in app_dict:
                            # NOTE(review): Vosk appears to return Chinese
                            # text as space-separated words -- confirm. Retry
                            # with the spaces removed so that e.g. "记事 本"
                            # still matches the keyword "记事本".
                            compact = "".join(app_to_open.split())
                            if compact in app_dict:
                                app_to_open = compact
                        launch_application(app_to_open, app_dict)
                    last_full_command = command
            else:
                # Utterance still in progress: query the partial result once
                # instead of calling PartialResult() twice per chunk.
                partial_json = rec.PartialResult()
                if partial_json:
                    partial_result = json.loads(partial_json).get('partial', '')
                    if (
                        partial_result
                        and "打开" in partial_result
                        and partial_result != last_partial_result
                    ):
                        print(f"部分结果: {partial_result}")
                        say(f"部分结果: {partial_result}")
                        last_partial_result = partial_result
    except KeyboardInterrupt:
        say("\n退出程序。")
    finally:
        # Release the audio device on every exit path.
        if stream is not None:
            stream.stop()
            stream.close()
关键词部分:为了提高识别准确率并兼容谐音内容,可以为同一个程序添加多个关键词,关键词之间使用英文逗号 "," 作为分隔
字典路径如果出现中文字符有可能会报错!
代码意义不大,如果考虑深入:可以尝试增加快捷键,以及相关应用接口可以更好控制
上班族打开电脑后第一件事情是启动相关应用,同样可以尝试多应用编组启动