commit 68acc666fc2f878266bc1d96e14a1e3dace57e76 Author: hkr04 Date: Mon Dec 2 11:43:41 2024 +0800 first commit diff --git a/API_KEY.py b/API_KEY.py new file mode 100644 index 0000000..b5319f5 --- /dev/null +++ b/API_KEY.py @@ -0,0 +1,15 @@ +# API_KEY.py +# 同济子豪兄 2024-5-22 +# 各种开放平台的KEY,不要外传 + +# 零一万物大模型开放平台 +# https://platform.lingyiwanwu.com +YI_KEY = "f8144ffaff7c459791XXXXXXXXX" + +# 百度智能云千帆ModelBuilder +# https://qianfan.cloud.baidu.com +QIANFAN_ACCESS_KEY = "ALTAKRELRxSXXXXXXXXXX" +QIANFAN_SECRET_KEY = "3737d9da82de4f2XXXXXXXXXX" + +# 百度智能云千帆AppBuilder-SDK +APPBUILDER_TOKEN = "bce-v3/ALTAK-7jr20xkZl4cDmhbQKA4ml/f560e5dc3XXXXXXX059XXXXXXXXX" diff --git a/README.ipynb b/README.ipynb new file mode 100644 index 0000000..8eb6776 --- /dev/null +++ b/README.ipynb @@ -0,0 +1,121 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c8a0febf-1001-4a87-b873-06bc1471187c", + "metadata": {}, + "source": [ + "# 语音控制智能体\n", + "\n", + "同济子豪兄 2024-5-23" + ] + }, + { + "cell_type": "markdown", + "id": "bb2091f1-1d00-40bc-9432-9d7cd3d9157e", + "metadata": {}, + "source": [ + "## 首先要做\n", + "\n", + "- 音频输出选择HDMI显示屏\n", + "\n", + "- 找到麦克风设备号\n", + "\n", + "- 手眼标定" + ] + }, + { + "cell_type": "markdown", + "id": "e2c144d1-059c-40d1-b69b-8485cb6686c5", + "metadata": {}, + "source": [ + "# 智能体Agent能够调用的函数" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1b933878-c06f-426d-8ca3-d4b5ddade0ac", + "metadata": {}, + "outputs": [], + "source": [ + "# 函数一:归零\n", + "# back_zero()\n", + "\n", + "# 函数二:放松机械臂\n", + "# relax_arms()\n", + "\n", + "# 函数三:摇头\n", + "# head_shake()\n", + "\n", + "# 函数四:点头\n", + "# head_nod()\n", + "\n", + "# 函数五:跳舞\n", + "# head_dance()\n", + "\n", + "# 函数六:开启吸泵\n", + "# pump_on()\n", + "\n", + "# 函数七:关闭吸泵\n", + "# pump_off()\n", + "\n", + "# 函数八:移动到指定坐标\n", + "# move_to_coords(X=150, Y=-120)\n", + "\n", + "# 函数九:指定关节旋转\n", + "# single_joint_move(1, 60)\n", + "\n", + "# 函数十:移动至俯视姿态\n", + "# move_to_top_view()\n", + "\n", + "# 
函数十一:拍一张俯视图\n", + "# top_view_shot()\n", + "\n", + "# 函数十二:开启摄像头\n", + "# check_camera()\n", + "\n", + "# 函数十三:LED灯变颜色\n", + "# llm_led('帮我把LED灯的颜色改为贝加尔湖的颜色')\n", + "\n", + "# 函数十四:移动物体\n", + "# vlm_move(PROMPT='帮我把红色方块放在小猪佩奇上')\n", + "\n", + "# 函数十五:拖动示教\n", + "# drag_teach()\n", + "\n", + "# 函数十六:休息等待\n", + "# time.sleep()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1d0c3be-3080-4543-a943-adb10e19e79b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/agent_go.py b/agent_go.py new file mode 100644 index 0000000..f64936b --- /dev/null +++ b/agent_go.py @@ -0,0 +1,75 @@ +# agent_go.py +# 同济子豪兄 2024-5-27 +# 看懂“图像”、听懂“人话”、指哪打哪的机械臂 +# 机械臂+大模型+多模态+语音识别=具身智能体Agent + +print('\n听得懂人话、看得懂图像、拎得清动作的具身智能机械臂!') +print('同济子豪兄 2024-5-27 \n') + +# 导入常用函数 +from utils_asr import * # 录音+语音识别 +from utils_robot import * # 连接机械臂 +from utils_llm import * # 大语言模型API +from utils_led import * # 控制LED灯颜色 +from utils_camera import * # 摄像头 +from utils_robot import * # 机械臂运动 +from utils_pump import * # GPIO、吸泵 +from utils_vlm_move import * # 多模态大模型识别图像,吸泵吸取并移动物体 +from utils_drag_teaching import * # 拖动示教 +from utils_agent import * # 智能体Agent编排 +from utils_tts import * # 语音合成模块 + +# print('播放欢迎词') +pump_off() +# back_zero() +play_wav('asset/welcome.wav') + + +def agent_play(): + ''' + 主函数,语音控制机械臂智能体编排动作 + ''' + # 归零 + back_zero() + + # print('测试摄像头') + # check_camera() + + # 输入指令 + # 先回到原点,再把LED灯改为墨绿色,然后把绿色方块放在篮球上 + start_record_ok = input('是否开启录音,输入数字录音指定时长,按k打字输入,按c输入默认指令\n') + if str.isnumeric(start_record_ok): + 
DURATION = int(start_record_ok) + record(DURATION=DURATION) # 录音 + order = speech_recognition() # 语音识别 + elif start_record_ok == 'k': + order = input('请输入指令') + elif start_record_ok == 'c': + order = '先归零,再摇头,然后把绿色方块放在篮球上' + else: + print('无指令,退出') + # exit() + raise NameError('无指令,退出') + + # 智能体Agent编排动作 + agent_plan_output = eval(agent_plan(order)) + + print('智能体编排动作如下\n', agent_plan_output) + # plan_ok = input('是否继续?按c继续,按q退出') + plan_ok = 'c' + if plan_ok == 'c': + response = agent_plan_output['response'] # 获取机器人想对我说的话 + print('开始语音合成') + tts(response) # 语音合成,导出wav音频文件 + play_wav('temp/tts.wav') # 播放语音合成音频文件 + for each in agent_plan_output['function']: # 运行智能体规划编排的每个函数 + print('开始执行动作', each) + eval(each) + elif plan_ok =='q': + # exit() + raise NameError('按q退出') + +# agent_play() +if __name__ == '__main__': + agent_play() + diff --git a/asset/SimHei.ttf b/asset/SimHei.ttf new file mode 100644 index 0000000..0875c08 Binary files /dev/null and b/asset/SimHei.ttf differ diff --git a/asset/welcome.wav b/asset/welcome.wav new file mode 100644 index 0000000..d8589fa Binary files /dev/null and b/asset/welcome.wav differ diff --git a/camera_check.py b/camera_check.py new file mode 100644 index 0000000..827b131 --- /dev/null +++ b/camera_check.py @@ -0,0 +1,20 @@ +# camera_check.py +# 调用摄像头实时画面,按q键退出 +# 同济子豪兄 2024-5-13 + +import cv2 +import numpy as np + +cap = cv2.VideoCapture(0) + +while(True): + ret, frame = cap.read() + + # gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) + + cv2.imshow('frame', frame) + if cv2.waitKey(1) & 0xFF == ord('q'): + break + +cap.release() +cv2.destroyAllWindows() \ No newline at end of file diff --git a/sound_check.py b/sound_check.py new file mode 100644 index 0000000..cdb9fe1 --- /dev/null +++ b/sound_check.py @@ -0,0 +1,16 @@ +# sound_check.py +# 快速检查语音相关的所有功能:麦克风、录音、扬声器播放声音、语音识别、语音合成 +# 同济子豪兄 2024-7-15 + +from utils_asr import * # 录音+语音识别 +from utils_tts import * # 语音合成模块 +print('开始录音5秒') +record(DURATION=5) # 录音 +print('播放录音') 
+play_wav('temp/speech_record.wav') +speech_result = speech_recognition() +print('开始语音合成') +tts(speech_result) +print('播放语音合成音频') +play_wav('temp/tts.wav') + diff --git a/temp/record.txt b/temp/record.txt new file mode 100644 index 0000000..b6a41a5 --- /dev/null +++ b/temp/record.txt @@ -0,0 +1,842 @@ +[ + [ + 2158, + 2046, + 1786, + 1747, + 1807, + 1326 + ], + [ + 2158, + 2046, + 1786, + 1744, + 1808, + 1326 + ], + [ + 2158, + 2047, + 1786, + 1743, + 1808, + 1326 + ], + [ + 2158, + 2051, + 1781, + 1730, + 1808, + 1326 + ], + [ + 2158, + 2051, + 1780, + 1711, + 1808, + 1326 + ], + [ + 2158, + 2061, + 1780, + 1683, + 1808, + 1326 + ], + [ + 2158, + 2104, + 1780, + 1623, + 1808, + 1326 + ], + [ + 2158, + 2153, + 1780, + 1562, + 1816, + 1326 + ], + [ + 2158, + 2185, + 1780, + 1522, + 1819, + 1326 + ], + [ + 2158, + 2221, + 1780, + 1492, + 1820, + 1326 + ], + [ + 2159, + 2269, + 1780, + 1454, + 1820, + 1326 + ], + [ + 2159, + 2324, + 1780, + 1406, + 1820, + 1326 + ], + [ + 2160, + 2382, + 1780, + 1358, + 1820, + 1326 + ], + [ + 2160, + 2442, + 1782, + 1309, + 1819, + 1326 + ], + [ + 2162, + 2488, + 1780, + 1251, + 1817, + 1326 + ], + [ + 2160, + 2538, + 1780, + 1204, + 1814, + 1326 + ], + [ + 2159, + 2596, + 1784, + 1176, + 1808, + 1326 + ], + [ + 2163, + 2638, + 1780, + 1128, + 1814, + 1324 + ], + [ + 2160, + 2718, + 1780, + 1044, + 1855, + 1300 + ], + [ + 2159, + 2781, + 1780, + 990, + 1926, + 1251 + ], + [ + 2162, + 2821, + 1782, + 989, + 1958, + 1176 + ], + [ + 2163, + 2844, + 1780, + 992, + 1970, + 1131 + ], + [ + 2163, + 2874, + 1783, + 979, + 1983, + 1097 + ], + [ + 2163, + 2904, + 1784, + 941, + 2010, + 1061 + ], + [ + 2163, + 2912, + 1785, + 932, + 2023, + 1055 + ], + [ + 2163, + 2913, + 1808, + 948, + 2025, + 1055 + ], + [ + 2163, + 2936, + 1901, + 1038, + 2025, + 1055 + ], + [ + 2163, + 2988, + 2051, + 1131, + 2028, + 1055 + ], + [ + 2164, + 3039, + 2210, + 1247, + 2030, + 1055 + ], + [ + 2188, + 3076, + 2352, + 1365, + 2022, + 1051 + ], + [ + 2229, + 3100, 
+ 2468, + 1467, + 1990, + 1030 + ], + [ + 2253, + 3118, + 2559, + 1543, + 1979, + 1007 + ], + [ + 2260, + 3139, + 2632, + 1584, + 1980, + 1006 + ], + [ + 2260, + 3162, + 2705, + 1613, + 1985, + 1006 + ], + [ + 2273, + 3189, + 2786, + 1658, + 1990, + 1006 + ], + [ + 2313, + 3215, + 2868, + 1715, + 1969, + 1006 + ], + [ + 2362, + 3233, + 2949, + 1786, + 1953, + 1006 + ], + [ + 2422, + 3247, + 3025, + 1845, + 1922, + 1006 + ], + [ + 2473, + 3265, + 3095, + 1868, + 1876, + 1006 + ], + [ + 2516, + 3293, + 3168, + 1874, + 1855, + 1006 + ], + [ + 2552, + 3332, + 3252, + 1902, + 1842, + 1006 + ], + [ + 2591, + 3372, + 3335, + 1938, + 1842, + 1006 + ], + [ + 2635, + 3397, + 3389, + 1969, + 1912, + 1006 + ], + [ + 2694, + 3401, + 3401, + 2025, + 2010, + 1006 + ], + [ + 2756, + 3400, + 3401, + 2051, + 2000, + 1006 + ], + [ + 2808, + 3397, + 3401, + 2054, + 2011, + 1006 + ], + [ + 2855, + 3396, + 3401, + 2039, + 2044, + 1006 + ], + [ + 2896, + 3396, + 3401, + 2003, + 2062, + 1006 + ], + [ + 2930, + 3396, + 3401, + 1982, + 2090, + 1006 + ], + [ + 2971, + 3396, + 3401, + 1980, + 2109, + 1006 + ], + [ + 3004, + 3395, + 3401, + 1994, + 2110, + 1006 + ], + [ + 3005, + 3355, + 3398, + 2067, + 2110, + 1006 + ], + [ + 3004, + 3246, + 3385, + 2248, + 2107, + 1006 + ], + [ + 3004, + 3118, + 3320, + 2402, + 2103, + 1006 + ], + [ + 2995, + 3026, + 3209, + 2386, + 2103, + 1006 + ], + [ + 2979, + 2953, + 3081, + 2351, + 2104, + 999 + ], + [ + 2978, + 2889, + 2925, + 2312, + 2077, + 999 + ], + [ + 2962, + 2835, + 2746, + 2207, + 2053, + 999 + ], + [ + 2939, + 2774, + 2557, + 2037, + 2006, + 999 + ], + [ + 2904, + 2716, + 2372, + 1904, + 1966, + 999 + ], + [ + 2867, + 2658, + 2221, + 1829, + 1951, + 999 + ], + [ + 2808, + 2604, + 2130, + 1790, + 1952, + 999 + ], + [ + 2714, + 2583, + 2115, + 1800, + 2007, + 999 + ], + [ + 2610, + 2544, + 2114, + 1905, + 2063, + 993 + ], + [ + 2552, + 2497, + 2088, + 2001, + 2095, + 894 + ], + [ + 2546, + 2435, + 1960, + 1971, + 2055, + 802 + ], + [ + 2541, + 
2351, + 1743, + 1820, + 2001, + 802 + ], + [ + 2498, + 2251, + 1493, + 1628, + 1966, + 802 + ], + [ + 2426, + 2152, + 1272, + 1459, + 1947, + 802 + ], + [ + 2342, + 2056, + 1121, + 1377, + 1902, + 805 + ], + [ + 2268, + 1961, + 1038, + 1387, + 1839, + 829 + ], + [ + 2215, + 1865, + 1004, + 1452, + 1799, + 855 + ], + [ + 2174, + 1775, + 984, + 1531, + 1756, + 921 + ], + [ + 2122, + 1702, + 947, + 1551, + 1724, + 999 + ], + [ + 2058, + 1626, + 881, + 1542, + 1714, + 1020 + ], + [ + 2005, + 1525, + 783, + 1516, + 1717, + 1023 + ], + [ + 1960, + 1413, + 675, + 1483, + 1723, + 1038 + ], + [ + 1922, + 1301, + 576, + 1469, + 1719, + 1089 + ], + [ + 1898, + 1192, + 498, + 1477, + 1704, + 1139 + ], + [ + 1893, + 1102, + 438, + 1493, + 1681, + 1146 + ], + [ + 1893, + 1054, + 399, + 1497, + 1662, + 1146 + ], + [ + 1893, + 1051, + 393, + 1482, + 1657, + 1146 + ], + [ + 1893, + 1064, + 395, + 1422, + 1656, + 1145 + ], + [ + 1893, + 1118, + 398, + 1343, + 1647, + 1145 + ], + [ + 1893, + 1204, + 410, + 1242, + 1649, + 1139 + ], + [ + 1893, + 1301, + 480, + 1214, + 1661, + 1127 + ], + [ + 1893, + 1395, + 603, + 1250, + 1685, + 1124 + ], + [ + 1893, + 1497, + 755, + 1317, + 1717, + 1111 + ], + [ + 1894, + 1609, + 918, + 1377, + 1757, + 1107 + ], + [ + 1897, + 1726, + 1084, + 1418, + 1803, + 1107 + ], + [ + 1897, + 1840, + 1252, + 1464, + 1850, + 1107 + ], + [ + 1897, + 1928, + 1412, + 1538, + 1895, + 1107 + ], + [ + 1897, + 1989, + 1559, + 1636, + 1939, + 1107 + ], + [ + 1897, + 2025, + 1690, + 1740, + 1976, + 1107 + ], + [ + 1898, + 2050, + 1799, + 1839, + 2003, + 1107 + ], + [ + 1898, + 2057, + 1868, + 1944, + 2019, + 1107 + ], + [ + 1898, + 2053, + 1887, + 2032, + 2055, + 1107 + ], + [ + 1898, + 2052, + 1884, + 2061, + 2086, + 1107 + ], + [ + 1898, + 2056, + 1883, + 2126, + 2135, + 1107 + ], + [ + 1898, + 2057, + 1883, + 2207, + 2155, + 1107 + ], + [ + 1898, + 2053, + 1886, + 2218, + 2152, + 1107 + ], + [ + 1898, + 2052, + 1886, + 2218, + 2154, + 1107 + ], + [ + 1898, + 2050, + 
1887, + 2175, + 2147, + 1107 + ], + [ + 1898, + 2055, + 1883, + 2081, + 2117, + 1107 + ], + [ + 1897, + 2057, + 1882, + 2072, + 2107, + 1107 + ] +] \ No newline at end of file diff --git a/temp/speech_record.wav b/temp/speech_record.wav new file mode 100644 index 0000000..861f32a Binary files /dev/null and b/temp/speech_record.wav differ diff --git a/temp/tts.wav b/temp/tts.wav new file mode 100644 index 0000000..51b37d3 Binary files /dev/null and b/temp/tts.wav differ diff --git a/temp/vl_now.jpg b/temp/vl_now.jpg new file mode 100644 index 0000000..d2d54e1 Binary files /dev/null and b/temp/vl_now.jpg differ diff --git a/temp/vl_now_viz.jpg b/temp/vl_now_viz.jpg new file mode 100644 index 0000000..22ae7b9 Binary files /dev/null and b/temp/vl_now_viz.jpg differ diff --git a/utils_agent.py b/utils_agent.py new file mode 100644 index 0000000..fbee592 --- /dev/null +++ b/utils_agent.py @@ -0,0 +1,75 @@ +# utils_agent.py +# 同济子豪兄 2024-5-23 +# Agent智能体相关函数 + +from utils_llm import * + +AGENT_SYS_PROMPT = ''' +你是我的机械臂助手,机械臂内置了一些函数,请你根据我的指令,以json形式输出要运行的对应函数和你给我的回复 + +【以下是所有内置函数介绍】 +机械臂位置归零,所有关节回到原点:back_zero() +放松机械臂,所有关节都可以自由手动拖拽活动:relax_arms() +做出摇头动作:head_shake() +做出点头动作:head_nod() +做出跳舞动作:head_dance() +打开吸泵:pump_on() +关闭吸泵:pump_off() +移动到指定XY坐标,比如移动到X坐标150,Y坐标-120:move_to_coords(X=150, Y=-120) +指定关节旋转,比如关节1旋转到60度,总共有6个关节:single_joint_move(1, 60) +移动至俯视姿态:move_to_top_view() +拍一张俯视图:top_view_shot() +开启摄像头,在屏幕上实时显示摄像头拍摄的画面:check_camera() +LED灯改变颜色,比如:llm_led('帮我把LED灯的颜色改为贝加尔湖的颜色') +将一个物体移动到另一个物体的位置上,比如:vlm_move('帮我把红色方块放在小猪佩奇上') +拖动示教,我可以拽着机械臂运动,然后机械臂模仿复现出一样的动作:drag_teach() +休息等待,比如等待两秒:time.sleep(2) + +【输出json格式】 +你直接输出json即可,从{开始,不要输出包含```json的开头或结尾 +在'function'键中,输出函数名列表,列表中每个元素都是字符串,代表要运行的函数名称和参数。每个函数既可以单独运行,也可以和其他函数先后运行。列表元素的先后顺序,表示执行函数的先后顺序 +在'response'键中,根据我的指令和你编排的动作,以第一人称输出你回复我的话,不要超过20个字,可以幽默和发散,用上歌词、台词、互联网热梗、名场面。比如李云龙的台词、甄嬛传的台词、练习时长两年半。 + +【以下是一些具体的例子】 +我的指令:回到原点。你输出:{'function':['back_zero()'], 'response':'回家吧,回到最初的美好'} 
+我的指令:先回到原点,然后跳舞。你输出:{'function':['back_zero()', 'head_dance()'], 'response':'我的舞姿,练习时长两年半'} +我的指令:先回到原点,然后移动到180, -90坐标。你输出:{'function':['back_zero()', 'move_to_coords(X=180, Y=-90)'], 'response':'精准不,老子打的就是精锐'} +我的指令:先打开吸泵,再把关节2旋转到30度。你输出:{'function':['pump_on()', single_joint_move(2, 30)], 'response':'你之前做的指星笔,就是通过关节2调俯仰角'} +我的指令:移动到X为160,Y为-30的地方。你输出:{'function':['move_to_coords(X=160, Y=-30)'], 'response':'坐标移动已完成'} +我的指令:拍一张俯视图,然后把LED灯的颜色改为黄金的颜色。你输出:{'function':['top_view_shot()', llm_led('把LED灯的颜色改为黄金的颜色')], 'response':'人工智能未来比黄金值钱,你信不信'} +我的指令:帮我把绿色方块放在小猪佩奇上面。你输出:{'function':[vlm_move('帮我把绿色方块放在小猪佩奇上面')], 'response':'它的弟弟乔治呢?'} +我的指令:帮我把红色方块放在李云龙的脸上。你输出:{'function':[vlm_move('帮我把红色方块放在李云龙的脸上')], 'response':'你他娘的真是个天才'} +我的指令:关闭吸泵,打开摄像头。你输出:{'function':[pump_off(), check_camera()], 'response':'你是我的眼,带我阅读浩瀚的书海'} +我的指令:先归零,再把LED灯的颜色改为墨绿色。你输出:{'function':[back_zero(), llm_led('把LED灯的颜色改为墨绿色')], 'response':'这种墨绿色,很像蜀南竹海的竹子'} +我的指令:我拽着你运动,然后你模仿复现出这个运动。你输出:{'function':['drag_teach()'], 'response':'你有本事拽一个鸡你太美'} +我的指令:开启拖动示教。你输出:{'function':['drag_teach()'], 'response':'你要我模仿我自己?'} +我的指令:先回到原点,等待三秒,再打开吸泵,把LED灯的颜色改成中国红,最后把绿色方块移动到摩托车上。你输出:{'function':['back_zero()', 'time.sleep(3)', 'pump_on()', llm_led('把LED灯的颜色改为中国红色', vlm_move('把绿色方块移动到摩托车上'))], 'response':'如果奇迹有颜色,那一定是中国红'} + +【一些李云龙相关的台词,如果和李云龙相关,可以在response中提及对应的台词】 +学习?学个屁 +给你半斤地瓜烧 +老子打的就是精锐 +二营长,你的意大利炮呢 +你他娘的真是个天才 +咱老李也是十里八乡的俊后生 +不报此仇,我李云龙誓不为人 +你猜旅长怎么说 +逢敌必亮剑,绝不含糊! +老子当初怎么教他打枪,现在就教他怎么打仗! +你咋就不敢跟旅长干一架呢? +你猪八戒戴眼镜充什么大学生啊? +我李云龙八岁习武,南拳北腿略知一二。 +死,也要死在冲锋的路上! 
+ + +【一些小猪佩奇相关的台词】 +这是我的弟弟乔治 + +【我现在的指令是】 +''' + +def agent_plan(AGENT_PROMPT='先回到原点,再把LED灯改为墨绿色,然后把绿色方块放在篮球上'): + print('Agent智能体编排动作') + PROMPT = AGENT_SYS_PROMPT + AGENT_PROMPT + agent_plan = llm_yi(PROMPT) + return agent_plan diff --git a/utils_asr.py b/utils_asr.py new file mode 100644 index 0000000..e48cf45 --- /dev/null +++ b/utils_asr.py @@ -0,0 +1,150 @@ +# utils_asr.py +# 同济子豪兄 2024-5-22 +# 录音+语音识别 + +print('导入录音+语音识别模块') + +import pyaudio +import wave +import numpy as np +import os +import sys +from API_KEY import * + +# 确定麦克风索引号 +# import sounddevice as sd +# print(sd.query_devices()) + +def record(MIC_INDEX=0, DURATION=5): + ''' + 调用麦克风录音,需用arecord -l命令获取麦克风ID + DURATION,录音时长 + ''' + print('开始 {} 秒录音'.format(DURATION)) + os.system('sudo arecord -D "plughw:{}" -f dat -c 1 -r 16000 -d {} temp/speech_record.wav'.format(MIC_INDEX, DURATION)) + print('录音结束') + +def record_auto(MIC_INDEX=1): + ''' + 开启麦克风录音,保存至'temp/speech_record.wav'音频文件 + 音量超过阈值自动开始录音,低于阈值一段时间后自动停止录音 + MIC_INDEX:麦克风设备索引号 + ''' + + CHUNK = 1024 # 采样宽度 + RATE = 16000 # 采样率 + + QUIET_DB = 2000 # 分贝阈值,大于则开始录音,否则结束 + delay_time = 1 # 声音降至分贝阈值后,经过多长时间,自动终止录音 + + FORMAT = pyaudio.paInt16 + CHANNELS = 1 if sys.platform == 'darwin' else 2 # 采样通道数 + + # 初始化录音 + p = pyaudio.PyAudio() + stream = p.open(format=FORMAT, + channels=CHANNELS, + rate=RATE, + input=True, + frames_per_buffer=CHUNK, + input_device_index=MIC_INDEX + ) + + frames = [] # 所有音频帧 + + flag = False # 是否已经开始录音 + quiet_flag = False # 当前音量小于阈值 + + temp_time = 0 # 当前时间是第几帧 + last_ok_time = 0 # 最后正常是第几帧 + START_TIME = 0 # 开始录音是第几帧 + END_TIME = 0 # 结束录音是第几帧 + + print('可以说话啦!') + + while True: + + # 获取当前chunk的声音 + data = stream.read(CHUNK, exception_on_overflow=False) + frames.append(data) + # 获取当前chunk的音量分贝值 + temp_volume = np.max(np.frombuffer(data, dtype=np.short)) + + if temp_volume > QUIET_DB and flag==False: + print("音量高于阈值,开始录音") + flag =True + START_TIME = temp_time + last_ok_time = temp_time + + if flag: # 录音中的各种情况 + + 
if(temp_volume < QUIET_DB and quiet_flag==False): + print("录音中,当前音量低于阈值") + quiet_flag = True + last_ok_time = temp_time + + if(temp_volume > QUIET_DB): + # print('录音中,当前音量高于阈值,正常录音') + quiet_flag = False + last_ok_time = temp_time + + if(temp_time > last_ok_time + delay_time*15 and quiet_flag==True): + print("音量低于阈值{:.2f}秒后,检测当前音量".format(delay_time)) + if(quiet_flag and temp_volume < QUIET_DB): + print("当前音量仍然小于阈值,录音结束") + END_TIME = temp_time + break + else: + print("当前音量重新高于阈值,继续录音中") + quiet_flag = False + last_ok_time = temp_time + + # print('当前帧 {} 音量 {}'.format(temp_time+1, temp_volume)) + temp_time += 1 + if temp_time > 150: # 超时直接退出 + END_TIME = temp_time + print('超时,录音结束') + break + + # 停止录音 + stream.stop_stream() + stream.close() + p.terminate() + + # 导出wav音频文件 + output_path = 'temp/speech_record.wav' + wf = wave.open(output_path, 'wb') + wf.setnchannels(CHANNELS) + wf.setsampwidth(p.get_sample_size(FORMAT)) + wf.setframerate(RATE) + wf.writeframes(b''.join(frames[START_TIME-2:END_TIME])) + wf.close() + print('保存录音文件', output_path) + +import appbuilder +# 配置密钥 +os.environ["APPBUILDER_TOKEN"] = APPBUILDER_TOKEN +asr = appbuilder.ASR() # 语音识别组件 +def speech_recognition(audio_path='temp/speech_record.wav'): + ''' + AppBuilder-SDK语音识别组件 + ''' + print('开始语音识别') + # 载入wav音频文件 + with wave.open(audio_path, 'rb') as wav_file: + + # 获取音频文件的基本信息 + num_channels = wav_file.getnchannels() + sample_width = wav_file.getsampwidth() + framerate = wav_file.getframerate() + num_frames = wav_file.getnframes() + + # 获取音频数据 + frames = wav_file.readframes(num_frames) + + # 向API发起请求 + content_data = {"audio_format": "wav", "raw_audio": frames, "rate": 16000} + message = appbuilder.Message(content_data) + speech_result = asr.run(message).content['result'][0] + print('语音识别结果:', speech_result) + return speech_result \ No newline at end of file diff --git a/utils_camera.py b/utils_camera.py new file mode 100644 index 0000000..64ee43e --- /dev/null +++ b/utils_camera.py @@ -0,0 +1,25 
@@ +# utils_camera.py +# 同济子豪兄 2024-5-22 +# 开启摄像头,调用摄像头实时画面,按q键退出 + +import cv2 +import numpy as np + +def check_camera(): + ''' + 开启摄像头,调用摄像头实时画面,按q键退出 + ''' + print('开启摄像头') + cap = cv2.VideoCapture(0) + + while(True): + ret, frame = cap.read() + + # gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) + + cv2.imshow('frame', frame) + if cv2.waitKey(1) & 0xFF == ord('q'): + break + + cap.release() + cv2.destroyAllWindows() \ No newline at end of file diff --git a/utils_drag_teaching.py b/utils_drag_teaching.py new file mode 100644 index 0000000..3ecc90b --- /dev/null +++ b/utils_drag_teaching.py @@ -0,0 +1,186 @@ +# utils_drag_teaching.py +# 同济子豪兄 2024-5-23 +# 拖动示教 + +print('导入拖动示教模块') + +import time +import os +import sys +import termios +import tty +import threading +import json + +from pymycobot.mycobot import MyCobot +from pymycobot import PI_PORT, PI_BAUD + +# 连接机械臂 +mc = MyCobot(PI_PORT, PI_BAUD, debug=False) + +class Raw(object): + """Set raw input mode for device""" + + def __init__(self, stream): + self.stream = stream + self.fd = self.stream.fileno() + + def __enter__(self): + self.original_stty = termios.tcgetattr(self.stream) + tty.setcbreak(self.stream) + + def __exit__(self, type, value, traceback): + termios.tcsetattr(self.stream, termios.TCSANOW, self.original_stty) + + +class Helper(object): + def __init__(self) -> None: + self.w, self.h = os.get_terminal_size() + + def echo(self, msg): + print("\r{}".format(" " * self.w), end="") + print("\r{}".format(msg), end="") + + +class TeachingTest(Helper): + def __init__(self, mycobot) -> None: + super().__init__() + self.mc = mycobot + self.recording = False + self.playing = False + self.record_list = [] + self.record_t = None + self.play_t = None + + def record(self): + self.record_list = [] + self.recording = True + self.mc.set_fresh_mode(0) + def _record(): + start_t = time.time() + + while self.recording: + angles = self.mc.get_encoders() + if angles: + self.record_list.append(angles) + time.sleep(0.1) + 
print("\r {}".format(time.time() - start_t), end="") + + self.echo("开始录制动作") + self.record_t = threading.Thread(target=_record, daemon=True) + self.record_t.start() + + def stop_record(self): + if self.recording: + self.recording = False + self.record_t.join() + self.echo("停止录制动作") + + def play(self): + self.echo("开始回放动作") + for angles in self.record_list: + # print(angles) + self.mc.set_encoders(angles, 80) + time.sleep(0.1) + self.echo("回放结束\n") + + def loop_play(self): + self.playing = True + + def _loop(): + len_ = len(self.record_list) + i = 0 + while self.playing: + idx_ = i % len_ + i += 1 + self.mc.set_encoders(self.record_list[idx_], 80) + time.sleep(0.1) + + self.echo("开始循环回放") + self.play_t = threading.Thread(target=_loop, daemon=True) + self.play_t.start() + + def stop_loop_play(self): + if self.playing: + self.playing = False + self.play_t.join() + self.echo("停止循环回放") + + def save_to_local(self): + if not self.record_list: + self.echo("No data should save.") + return + + save_path = os.path.dirname(__file__) + "/temp/record.txt" + with open(save_path, "w") as f: + json.dump(self.record_list, f, indent=2) + self.echo("回放动作导出至: {}".format(save_path)) + + def load_from_local(self): + + with open(os.path.dirname(__file__) + "/temp/record.txt", "r") as f: + try: + data = json.load(f) + self.record_list = data + self.echo("载入本地动作数据成功") + except Exception: + self.echo("Error: invalid data.") + + def print_menu(self): + print( + """\ + \r 拖动示教 同济子豪兄 + \r q: 退出 + \r r: 开始录制动作 + \r c: 停止录制动作 + \r p: 回放动作 + \r P: 循环回放/停止循环回放 + \r s: 将录制的动作保存到本地 + \r l: 从本地读取录制好的动作 + \r f: 放松机械臂 + \r---------------------------------- + """ + ) + + def start(self): + self.print_menu() + + while not False: + with Raw(sys.stdin): + key = sys.stdin.read(1) + if key == "q": + break + elif key == "r": # recorder + self.record() + elif key == "c": # stop recorder + self.stop_record() + elif key == "p": # play + self.play() + elif key == "P": # loop play + if not self.playing: + 
self.loop_play() + else: + self.stop_loop_play() + elif key == "s": # save to local + self.save_to_local() + elif key == "l": # load from local + self.load_from_local() + elif key == "f": # free move + self.mc.release_all_servos() + self.echo("Released") + else: + print(key) + continue + +def drag_teach(): + + print('机械臂归零') + mc.send_angles([0, 0, 0, 0, 0, 0], 40) + time.sleep(3) + + recorder = TeachingTest(mc) + recorder.start() + + print('机械臂归零') + mc.send_angles([0, 0, 0, 0, 0, 0], 40) + time.sleep(3) diff --git a/utils_led.py b/utils_led.py new file mode 100644 index 0000000..1bd43d9 --- /dev/null +++ b/utils_led.py @@ -0,0 +1,41 @@ +# utils_led.py +# 同济子豪兄 2024-5-22 +# 大模型控制LED灯颜色 + +from utils_llm import llm_qianfan, llm_yi +from utils_robot import mc + +print('导入LED灯控制模块') + +# 备选颜色 +# 贝加尔湖、中国红、大海、绿叶、金子、蓝宝石、小猪佩奇、墨绿色、黑色 + +# 系统提示词 +SYS_PROMPT = '我即将说的这句话中包含一个目标物体,帮我把这个物体的一种可能的颜色,以0-255的RGB像素值形式返回给我,整理成元组格式,例如(255, 30, 60),直接回复元组本身,以括号开头,不要回复任何中文内容,下面是这句话:' + +def llm_led(PROMPT_LED='帮我把LED灯的颜色改为贝加尔湖的颜色'): + ''' + 大模型控制LED灯颜色 + ''' + + PROMPT = SYS_PROMPT + PROMPT_LED + + n = 1 + while n < 5: + try: + # 调用大模型API + # response = llm_qianfan(PROMPT) + response = llm_yi(PROMPT) + + # 提取颜色 + rgb_tuple = eval(response) + + # 设置LED灯的RGB颜色 + mc.set_color(rgb_tuple[0], rgb_tuple[1], rgb_tuple[2]) + print('LED灯颜色修改成功', rgb_tuple) + + break + + except Exception as e: + print('大模型返回json结构错误,再尝试一次', e) + n += 1 \ No newline at end of file diff --git a/utils_llm.py b/utils_llm.py new file mode 100644 index 0000000..6809848 --- /dev/null +++ b/utils_llm.py @@ -0,0 +1,59 @@ +# utils_llm.py +# 同济子豪兄 2024-5-22 +# 调用大语言模型API + +print('导入大模型API模块') + + +import os + +import qianfan +def llm_qianfan(PROMPT='你好,你是谁?'): + ''' + 百度智能云千帆大模型平台API + ''' + + # 传入 ACCESS_KEY 和 SECRET_KEY + os.environ["QIANFAN_ACCESS_KEY"] = QIANFAN_ACCESS_KEY + os.environ["QIANFAN_SECRET_KEY"] = QIANFAN_SECRET_KEY + + # 选择大语言模型 + MODEL = "ERNIE-Bot-4" + # MODEL = "ERNIE Speed" + # MODEL = "ERNIE-Lite-8K" 
+ # MODEL = 'ERNIE-Tiny-8K' + + chat_comp = qianfan.ChatCompletion(model=MODEL) + + # 输入给大模型 + resp = chat_comp.do( + messages=[{"role": "user", "content": PROMPT}], + top_p=0.8, + temperature=0.3, + penalty_score=1.0 + ) + + response = resp["result"] + return response + +import openai +from openai import OpenAI +from API_KEY import * +def llm_yi(PROMPT='你好,你是谁?'): + ''' + 零一万物大模型API + ''' + + API_BASE = "https://api.lingyiwanwu.com/v1" + API_KEY = YI_KEY + + MODEL = 'yi-large' + # MODEL = 'yi-medium' + # MODEL = 'yi-spark' + + # 访问大模型API + client = OpenAI(api_key=API_KEY, base_url=API_BASE) + completion = client.chat.completions.create(model=MODEL, messages=[{"role": "user", "content": PROMPT}]) + result = completion.choices[0].message.content.strip() + return result + diff --git a/utils_pump.py b/utils_pump.py new file mode 100644 index 0000000..b1287da --- /dev/null +++ b/utils_pump.py @@ -0,0 +1,37 @@ +# utils_pump.py +# 同济子豪兄 2024-5-22 +# GPIO引脚、吸泵相关函数 + +print('导入吸泵控制模块') +import RPi.GPIO as GPIO +import time + +# 初始化GPIO +GPIO.setwarnings(False) # 不打印 warning 信息 +GPIO.setmode(GPIO.BCM) +GPIO.setup(20, GPIO.OUT) +GPIO.setup(21, GPIO.OUT) +GPIO.output(20, 1) # 关闭吸泵电磁阀 + +def pump_on(): + ''' + 开启吸泵 + ''' + print(' 开启吸泵') + GPIO.output(20, 0) + +def pump_off(): + ''' + 关闭吸泵,吸泵放气,释放物体 + ''' + print(' 关闭吸泵') + GPIO.output(20, 1) # 关闭吸泵电磁阀 + time.sleep(0.05) + GPIO.output(21, 0) # 打开泄气阀门 + time.sleep(0.2) + GPIO.output(21, 1) + time.sleep(0.05) + GPIO.output(21, 0) # 再一次泄气,确保物体释放 + time.sleep(0.2) + GPIO.output(21, 1) + time.sleep(0.05) \ No newline at end of file diff --git a/utils_robot.py b/utils_robot.py new file mode 100644 index 0000000..df5bc3b --- /dev/null +++ b/utils_robot.py @@ -0,0 +1,222 @@ +# utils_robot.py +# 同济子豪兄 2024-5-22 +# 启动并连接机械臂,导入各种工具包 + +print('导入机械臂连接模块') + +from pymycobot.mycobot import MyCobot +from pymycobot import PI_PORT, PI_BAUD +import cv2 +import numpy as np +import time +from utils_pump import * + +# 连接机械臂 +mc = MyCobot(PI_PORT, 
PI_BAUD) +# 设置运动模式为插补 +mc.set_fresh_mode(0) + +import RPi.GPIO as GPIO +# 初始化GPIO +GPIO.setwarnings(False) # 不打印 warning 信息 +GPIO.setmode(GPIO.BCM) +GPIO.setup(20, GPIO.OUT) +GPIO.setup(21, GPIO.OUT) +GPIO.output(20, 1) # 关闭吸泵电磁阀 + +def back_zero(): + ''' + 机械臂归零 + ''' + print('机械臂归零') + mc.send_angles([0, 0, 0, 0, 0, 0], 40) + time.sleep(3) + +def relax_arms(): + print('放松机械臂关节') + mc.release_all_servos() + +def head_shake(): + # 左右摆头 + mc.send_angles([0.87,(-50.44),47.28,0.35,(-0.43),(-0.26)],70) + time.sleep(1) + for count in range(2): + mc.send_angle(5, 30, 80) + time.sleep(0.5) + mc.send_angle(5, -30,80) + time.sleep(0.5) + # mc.send_angles([0.87,(-50.44),47.28,0.35,(-0.43),(-0.26)],70) + # time.sleep(1) + mc.send_angles([0, 0, 0, 0, 0, 0], 40) + time.sleep(2) + +def head_dance(): + # 跳舞 + mc.send_angles([0.87,(-50.44),47.28,0.35,(-0.43),(-0.26)],70) + time.sleep(1) + for count in range(1): + mc.send_angles([(-0.17),(-94.3),118.91,(-39.9),59.32,(-0.52)],80) + time.sleep(1.2) + mc.send_angles([67.85,(-3.42),(-116.98),106.52,23.11,(-0.52)],80) + time.sleep(1.7) + mc.send_angles([(-38.14),(-115.04),116.63,69.69,3.25,(-11.6)],80) + time.sleep(1.7) + mc.send_angles([2.72,(-26.19),140.27,(-110.74),(-6.15),(-11.25)],80) + time.sleep(1) + mc.send_angles([0,0,0,0,0,0],80) + +def head_nod(): + # 点头 + mc.send_angles([0.87,(-50.44),47.28,0.35,(-0.43),(-0.26)],70) + for count in range(2): + mc.send_angle(4, 13, 70) + time.sleep(0.5) + mc.send_angle(4, -20, 70) + time.sleep(1) + mc.send_angle(4,13,70) + time.sleep(0.5) + mc.send_angles([0.87,(-50.44),47.28,0.35,(-0.43),(-0.26)],70) + +def move_to_coords(X=150, Y=-130, HEIGHT_SAFE=230): + print('移动至指定坐标:X {} Y {}'.format(X, Y)) + mc.send_coords([X, Y, HEIGHT_SAFE, 0, 180, 90], 20, 0) + time.sleep(4) + +def single_joint_move(joint_index, angle): + print('关节 {} 旋转至 {} 度'.format(joint_index, angle)) + mc.send_angle(joint_index, angle, 40) + time.sleep(2) + +def move_to_top_view(): + print('移动至俯视姿态') + mc.send_angles([-62.13, 
8.96, -87.71, -14.41, 2.54, -16.34], 10) + time.sleep(3) + +def top_view_shot(check=False): + ''' + 拍摄一张图片并保存 + check:是否需要人工看屏幕确认拍照成功,再在键盘上按q键确认继续 + ''' + print(' 移动至俯视姿态') + move_to_top_view() + + # 获取摄像头,传入0表示获取系统默认摄像头 + cap = cv2.VideoCapture(0) + # 打开cap + cap.open(0) + time.sleep(0.3) + success, img_bgr = cap.read() + + # 保存图像 + print(' 保存至temp/vl_now.jpg') + cv2.imwrite('temp/vl_now.jpg', img_bgr) + + # 屏幕上展示图像 + cv2.destroyAllWindows() # 关闭所有opencv窗口 + cv2.imshow('zihao_vlm', img_bgr) + + if check: + print('请确认拍照成功,按c键继续,按q键退出') + while(True): + key = cv2.waitKey(10) & 0xFF + if key == ord('c'): # 按c键继续 + break + if key == ord('q'): # 按q键退出 + # exit() + cv2.destroyAllWindows() # 关闭所有opencv窗口 + raise NameError('按q退出') + else: + if cv2.waitKey(10) & 0xFF == None: + pass + + # 关闭摄像头 + cap.release() + # 关闭图像窗口 + # cv2.destroyAllWindows() + +def eye2hand(X_im=160, Y_im=120): + ''' + 输入目标点在图像中的像素坐标,转换为机械臂坐标 + ''' + + # 整理两个标定点的坐标 + cali_1_im = [130, 290] # 左下角,第一个标定点的像素坐标,要手动填! + cali_1_mc = [-21.8, -197.4] # 左下角,第一个标定点的机械臂坐标,要手动填! + cali_2_im = [640, 0] # 右上角,第二个标定点的像素坐标 + cali_2_mc = [215, -59.1] # 右上角,第二个标定点的机械臂坐标,要手动填! 
+ + X_cali_im = [cali_1_im[0], cali_2_im[0]] # 像素坐标 + X_cali_mc = [cali_1_mc[0], cali_2_mc[0]] # 机械臂坐标 + Y_cali_im = [cali_2_im[1], cali_1_im[1]] # 像素坐标,先小后大 + Y_cali_mc = [cali_2_mc[1], cali_1_mc[1]] # 机械臂坐标,先大后小 + + # X差值 + X_mc = int(np.interp(X_im, X_cali_im, X_cali_mc)) + + # Y差值 + Y_mc = int(np.interp(Y_im, Y_cali_im, Y_cali_mc)) + + return X_mc, Y_mc + +# 吸泵吸取并移动物体 +def pump_move(mc, XY_START=[230,-50], HEIGHT_START=90, XY_END=[100,220], HEIGHT_END=100, HEIGHT_SAFE=220): + + ''' + 用吸泵,将物体从起点吸取移动至终点 + + mc:机械臂实例 + XY_START:起点机械臂坐标 + HEIGHT_START:起点高度,方块用90,药盒子用70 + XY_END:终点机械臂坐标 + HEIGHT_END:终点高度 + HEIGHT_SAFE:搬运途中安全高度 + ''' + + # 初始化GPIO + GPIO.setmode(GPIO.BCM) + GPIO.setup(20, GPIO.OUT) + GPIO.setup(21, GPIO.OUT) + + # 设置运动模式为插补 + mc.set_fresh_mode(0) + + # # 机械臂归零 + # print(' 机械臂归零') + # mc.send_angles([0, 0, 0, 0, 0, 0], 40) + # time.sleep(4) + + # 吸泵移动至物体上方 + print(' 吸泵移动至物体上方') + mc.send_coords([XY_START[0], XY_START[1], HEIGHT_SAFE, 0, 180, 90], 20, 0) + time.sleep(4) + + # 开启吸泵 + pump_on() + + # 吸泵向下吸取物体 + print(' 吸泵向下吸取物体') + mc.send_coords([XY_START[0], XY_START[1], HEIGHT_START, 0, 180, 90], 15, 0) + time.sleep(4) + + # 升起物体 + print(' 升起物体') + mc.send_coords([XY_START[0], XY_START[1], HEIGHT_SAFE, 0, 180, 90], 15, 0) + time.sleep(4) + + # 搬运物体至目标上方 + print(' 搬运物体至目标上方') + mc.send_coords([XY_END[0], XY_END[1], HEIGHT_SAFE, 0, 180, 90], 15, 0) + time.sleep(4) + + # 向下放下物体 + print(' 向下放下物体') + mc.send_coords([XY_END[0], XY_END[1], HEIGHT_END, 0, 180, 90], 20, 0) + time.sleep(3) + + # 关闭吸泵 + pump_off() + + # 机械臂归零 + print(' 机械臂归零') + mc.send_angles([0, 0, 0, 0, 0, 0], 40) + time.sleep(3) diff --git a/utils_tts.py b/utils_tts.py new file mode 100644 index 0000000..b5bbe3d --- /dev/null +++ b/utils_tts.py @@ -0,0 +1,60 @@ +# utils_tts.py +# 同济子豪兄 2024-5-23 +# 语音合成 + +print('导入语音合成模块') + +import os +import appbuilder +from API_KEY import * +import pyaudio +import wave + +tts_ab = appbuilder.TTS() + +def tts(TEXT='我是同济子豪兄的麒麟臂', tts_wav_path = 
def tts(TEXT='我是同济子豪兄的麒麟臂', tts_wav_path='temp/tts.wav'):
    '''
    Text-to-speech: synthesize TEXT into a wav file at tts_wav_path using
    Baidu AppBuilder's paddlespeech-tts model.
    '''
    inp = appbuilder.Message(content={"text": TEXT})
    out = tts_ab.run(inp, model="paddlespeech-tts", audio_type="wav")
    # The service returns the raw wav bytes in content["audio_binary"].
    with open(tts_wav_path, "wb") as f:
        f.write(out.content["audio_binary"])

def play_wav(wav_file='asset/welcome.wav'):
    '''
    Play a wav file through the system audio output via aplay (blocking).

    Best-effort: like the original os.system call, a missing aplay binary
    only yields a nonzero shell exit status, never an exception.
    '''
    import shlex
    # Quote the path before interpolating it into the shell command, so a
    # filename with spaces or shell metacharacters cannot break (or hijack)
    # the command line.
    os.system('aplay -t wav {} -q'.format(shlex.quote(wav_file)))
def yi_vision_api(PROMPT='帮我把红色方块放在钢笔上', img_path='temp/vl_now.jpg'):
    '''
    Call the 01.AI (Lingyiwanwu) yi-vision multimodal model with the system
    prompt plus PROMPT and the image at img_path.

    Returns the parsed dict the model was instructed to emit:
    {"start": ..., "start_xyxy": [[x,y],[x,y]], "end": ..., "end_xyxy": ...}

    Raises ValueError/SyntaxError (from ast.literal_eval) if the model reply
    is not a valid literal -- callers already retry on exceptions.
    '''
    import ast

    client = OpenAI(
        api_key=YI_KEY,
        base_url="https://api.lingyiwanwu.com/v1"
    )

    # Encode the image as a base64 data URL, as the API expects.
    with open(img_path, 'rb') as image_file:
        image = 'data:image/jpeg;base64,' + base64.b64encode(image_file.read()).decode('utf-8')

    # Send prompt + image to the model.
    completion = client.chat.completions.create(
        model="yi-vision",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": SYSTEM_PROMPT + PROMPT
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image
                        }
                    }
                ]
            },
        ]
    )

    # SECURITY FIX: the reply comes from an external, untrusted service.
    # ast.literal_eval parses the same dict/list literals eval() would, but
    # cannot execute arbitrary code embedded in the response.
    result = ast.literal_eval(completion.choices[0].message.content.strip())
    print(' 大模型调用成功!')

    return result
def post_processing_viz(result, img_path, check=False):
    '''
    Post-process and visualize the vision model's output.

    result:   dict with 'start'/'end' object names and 'start_xyxy'/'end_xyxy'
              corner coordinates in the model's 0-999 reference frame
    img_path: path of the photo the model analyzed
    check:    if True, wait for human confirmation -- 'c' continues, 'q'
              raises NameError (kept as the exception type callers expect)

    Returns the pixel-center coordinates:
    (START_X_CENTER, START_Y_CENTER, END_X_CENTER, END_Y_CENTER)
    '''
    img_bgr = cv2.imread(img_path)
    img_h = img_bgr.shape[0]
    img_w = img_bgr.shape[1]
    # The model reports coordinates on a 0-999 virtual canvas; rescale to
    # actual image pixels.
    FACTOR = 999

    def scale_box(xyxy):
        # Convert one [[x_min, y_min], [x_max, y_max]] box from model
        # coordinates to integer pixel coordinates plus its center point.
        x_min = int(xyxy[0][0] * img_w / FACTOR)
        y_min = int(xyxy[0][1] * img_h / FACTOR)
        x_max = int(xyxy[1][0] * img_w / FACTOR)
        y_max = int(xyxy[1][1] * img_h / FACTOR)
        x_c = int((x_min + x_max) / 2)
        y_c = int((y_min + y_max) / 2)
        return x_min, y_min, x_max, y_max, x_c, y_c

    START_NAME = result['start']
    END_NAME = result['end']
    (START_X_MIN, START_Y_MIN, START_X_MAX, START_Y_MAX,
     START_X_CENTER, START_Y_CENTER) = scale_box(result['start_xyxy'])
    (END_X_MIN, END_Y_MIN, END_X_MAX, END_Y_MAX,
     END_X_CENTER, END_Y_CENTER) = scale_box(result['end_xyxy'])

    # Draw the start box/center in red (BGR order) and end in blue.
    img_bgr = cv2.rectangle(img_bgr, (START_X_MIN, START_Y_MIN), (START_X_MAX, START_Y_MAX), [0, 0, 255], thickness=3)
    img_bgr = cv2.circle(img_bgr, [START_X_CENTER, START_Y_CENTER], 6, [0, 0, 255], thickness=-1)
    img_bgr = cv2.rectangle(img_bgr, (END_X_MIN, END_Y_MIN), (END_X_MAX, END_Y_MAX), [255, 0, 0], thickness=3)
    img_bgr = cv2.circle(img_bgr, [END_X_CENTER, END_Y_CENTER], 6, [255, 0, 0], thickness=-1)

    # OpenCV cannot render CJK glyphs, so route the labels through PIL with
    # the module-level Chinese font.
    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)  # BGR -> RGB
    img_pil = Image.fromarray(img_rgb)                  # array -> PIL
    draw = ImageDraw.Draw(img_pil)
    draw.text((START_X_MIN, START_Y_MIN-32), START_NAME, font=font, fill=(255, 0, 0, 1))
    draw.text((END_X_MIN, END_Y_MIN-32), END_NAME, font=font, fill=(0, 0, 255, 1))
    img_bgr = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)  # RGB -> BGR

    # Save the visualization: a fixed "latest" copy plus a timestamped one.
    cv2.imwrite('temp/vl_now_viz.jpg', img_bgr)
    formatted_time = time.strftime("%Y%m%d%H%M", time.localtime())
    cv2.imwrite('visualizations/{}.jpg'.format(formatted_time), img_bgr)

    # Show the visualization on screen.
    cv2.imshow('zihao_vlm', img_bgr)

    if check:
        print(' 请确认可视化成功,按c键继续,按q键退出')
        while True:
            key = cv2.waitKey(10) & 0xFF
            if key == ord('c'):  # continue
                break
            if key == ord('q'):  # abort
                cv2.destroyAllWindows()
                raise NameError('按q退出')
    else:
        # One waitKey pump is required for imshow to actually paint the
        # window. The original compared the result to None, which is always
        # False (waitKey returns an int) -- dead code removed.
        cv2.waitKey(1)

    return START_X_CENTER, START_Y_CENTER, END_X_CENTER, END_Y_CENTER
def vlm_move(PROMPT='帮我把绿色方块放在小猪佩奇上', input_way='keyboard'):
    '''
    Full pick-and-place pipeline: the multimodal model locates the objects
    named in PROMPT on a fresh top-view photo, pixel coordinates are mapped
    to arm coordinates, and the suction pump moves the object.

    PROMPT:    natural-language instruction ("put A on B")
    input_way: 'speech' or 'keyboard' -- kept for API compatibility; the
               interactive input path is currently disabled.

    Raises RuntimeError if the vision model never returns a usable result.
    '''
    print('多模态大模型识别图像,吸泵吸取并移动物体')

    # Home the arm before doing anything else.
    print('机械臂归零')
    mc.send_angles([0, 0, 0, 0, 0, 0], 50)
    time.sleep(3)

    ## Step 1: hand-eye calibration is assumed done (constants in eye2hand).
    print('第一步:完成手眼标定')

    ## Step 2: the instruction (interactive keyboard/speech input disabled).
    print('第二步,给出的指令是:', PROMPT)

    ## Step 3: take the top-view photo.
    print('第三步:拍摄俯视图')
    top_view_shot(check=False)

    ## Step 4: send the photo to the multimodal vision model.
    print('第四步:将图片输入给多模态视觉大模型')
    img_path = 'temp/vl_now.jpg'

    # The model occasionally returns malformed data; retry up to 4 times
    # (same attempt count as the original n < 5 loop).
    result = None
    for attempt in range(1, 5):
        try:
            print(' 尝试第 {} 次访问多模态大模型'.format(attempt))
            result = yi_vision_api(PROMPT, img_path='temp/vl_now.jpg')
            print(' 多模态大模型调用成功!')
            print(result)
            break
        except Exception as e:
            print(' 多模态大模型返回数据结构错误,再尝试一次', e)
    else:
        # BUG FIX: the original fell through with `result` unbound after
        # exhausting its retries and crashed later with NameError; fail
        # loudly at the point of error instead.
        raise RuntimeError('多模态大模型调用失败,已重试多次')

    ## Step 5: post-process and visualize the model output.
    print('第五步:视觉大模型输出结果后处理和可视化')
    START_X_CENTER, START_Y_CENTER, END_X_CENTER, END_Y_CENTER = post_processing_viz(result, img_path, check=True)

    ## Step 6: map pixel coordinates to arm coordinates.
    print('第六步:手眼标定,将像素坐标转换为机械臂坐标')
    START_X_MC, START_Y_MC = eye2hand(START_X_CENTER, START_Y_CENTER)
    END_X_MC, END_Y_MC = eye2hand(END_X_CENTER, END_Y_CENTER)

    ## Step 7: pick and place with the suction pump.
    print('第七步:吸泵吸取移动物体')
    pump_move(mc=mc, XY_START=[START_X_MC, START_Y_MC], XY_END=[END_X_MC, END_Y_MC])

    ## Step 8: clean up.
    print('第八步:任务完成')
    GPIO.cleanup()          # release the GPIO pin channels
    cv2.destroyAllWindows() # close all OpenCV windows