From 6c3e2d44afe418d02c14608d20991f6b22c4d958 Mon Sep 17 00:00:00 2001
From: SimolZimol <70102430+SimolZimol@users.noreply.github.com>
Date: Fri, 29 Aug 2025 17:27:44 +0200
Subject: [PATCH] modified: bot.py

---
 bot.py | 250 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 250 insertions(+)

diff --git a/bot.py b/bot.py
index 2c1c458..1572729 100644
--- a/bot.py
+++ b/bot.py
@@ -21,6 +21,7 @@ import hashlib
 from datetime import datetime, timedelta
 import concurrent.futures
 from gtts import gTTS
+import speech_recognition as sr
 import shutil
 from bs4 import BeautifulSoup
 from dotenv import load_dotenv
@@ -2506,6 +2507,255 @@ async def leave(ctx):
     else:
         await ctx.send("I am not in a voice channel.")
 
+def _remove_temp_file(path):
+    """Delete a temporary file, logging instead of raising on failure."""
+    try:
+        os.remove(path)
+    except FileNotFoundError:
+        pass  # Already gone; nothing left to clean up.
+    except OSError as e:
+        logger.warning(f"Could not clean up temp files: {e}")
+
+async def _send_command_response(ctx, content=None, embed=None, ephemeral=False):
+    """Reply to a hybrid command; send failures are logged, not raised.
+
+    ``ctx.send`` handles prefix and slash invocations alike (after a
+    ``ctx.defer()`` it posts the follow-up message), so a single code path
+    is enough.  discord.py's ``commands.Context`` has no ``followup``.
+    """
+    try:
+        await ctx.send(content, embed=embed, ephemeral=ephemeral)
+    except discord.HTTPException as e:
+        logger.error(f"Error sending response: {e}")
+
+def _transcribe_audio_file(source_path, file_extension, language='en-US'):
+    """Transcribe an audio file with Google Speech Recognition.
+
+    This function blocks (audio conversion plus a network request), so
+    call it from an executor rather than directly on the event loop.
+
+    Args:
+        source_path: Path of the downloaded audio file.
+        file_extension: Lower-case file extension, including the dot.
+        language: Language tag passed to the recognizer.
+
+    Returns:
+        The recognized text.
+
+    Raises:
+        sr.UnknownValueError: The speech could not be understood.
+        sr.RequestError: The recognition service request failed.
+    """
+    wav_path = source_path
+    try:
+        if file_extension != '.wav':
+            # pydub is imported lazily: it is only needed for non-WAV
+            # uploads, which are converted to WAV before recognition.
+            from pydub import AudioSegment
+            wav_path = os.path.splitext(source_path)[0] + '.wav'
+            AudioSegment.from_file(source_path).export(wav_path, format="wav")
+
+        recognizer = sr.Recognizer()
+        with sr.AudioFile(wav_path) as source:
+            # adjust_for_ambient_noise() is deliberately not called: it
+            # reads from the source, so the start of the file would be
+            # missing from the audio captured by record().
+            audio = recognizer.record(source)
+        return recognizer.recognize_google(audio, language=language)
+    finally:
+        if wav_path != source_path:
+            _remove_temp_file(wav_path)
+
+@client.hybrid_command()
+async def speech_to_text(ctx, *, audio_attachment: discord.Attachment = None):
+    """Converts audio file to text using speech recognition.
+
+    Usage:
+    /speech_to_text [attach audio file]
+
+    Supported formats: .wav, .mp3, .m4a, .ogg, .flac
+    """
+    # The docstring is kept as written: discord.py uses a command's
+    # docstring as its help text, so developer notes go in comments.
+    import asyncio
+    import io
+
+    supported_formats = ['.wav', '.mp3', '.m4a', '.ogg', '.flac']
+    max_size = 10 * 1024 * 1024  # 10MB; deliberately below Discord's limit
+    max_field_chars = 1000  # keeps the text under the embed field limit
+
+    def error_embed(title, description):
+        return discord.Embed(title=title, description=description, color=0xff0000)
+
+    # Acknowledge slash invocations right away; transcription takes a while.
+    if getattr(ctx, 'interaction', None):
+        await ctx.defer()
+
+    temp_filepath = None
+    try:
+        # Prefer the command parameter, then fall back to a file attached
+        # to the invoking message.
+        attachment = audio_attachment
+        message = getattr(ctx, 'message', None)
+        if not attachment and message and message.attachments:
+            attachment = message.attachments[0]
+
+        if not attachment:
+            await _send_command_response(ctx, ephemeral=True, embed=error_embed(
+                "āŒ No Audio File",
+                "Please attach an audio file to convert to text.\n\nSupported formats: `.wav`, `.mp3`, `.m4a`, `.ogg`, `.flac`",
+            ))
+            return
+
+        file_extension = os.path.splitext(attachment.filename)[1].lower()
+        if file_extension not in supported_formats:
+            await _send_command_response(ctx, ephemeral=True, embed=error_embed(
+                "āŒ Unsupported Format",
+                f"File format `{file_extension}` is not supported.\n\nSupported formats: {', '.join(supported_formats)}",
+            ))
+            return
+
+        if attachment.size > max_size:
+            await _send_command_response(ctx, ephemeral=True, embed=error_embed(
+                "āŒ File Too Large",
+                f"File size ({attachment.size / 1024 / 1024:.1f}MB) exceeds maximum allowed size (10MB).",
+            ))
+            return
+
+        await _send_command_response(ctx, embed=discord.Embed(
+            title="šŸŽ¤ Processing Audio",
+            description="Downloading and converting audio to text...",
+            color=0x3498db,
+        ))
+
+        # Name the temp file from a UUID plus the validated extension so the
+        # user-supplied file name never becomes part of a filesystem path.
+        temp_dir = "temp_audio"
+        os.makedirs(temp_dir, exist_ok=True)
+        temp_filepath = os.path.join(temp_dir, f"{uuid.uuid4()}{file_extension}")
+        audio_data = await attachment.read()
+        with open(temp_filepath, 'wb') as f:
+            f.write(audio_data)
+
+        # Conversion and recognition block, so run them in a worker thread
+        # to keep the bot's event loop responsive.
+        loop = asyncio.get_running_loop()
+        try:
+            text = await loop.run_in_executor(
+                None, _transcribe_audio_file, temp_filepath, file_extension
+            )
+        except sr.UnknownValueError:
+            await _send_command_response(ctx, embed=error_embed(
+                "āŒ Speech Not Recognized",
+                "Could not understand the speech in the audio file. Please ensure:\n• Audio is clear and not too noisy\n• Speech is in English\n• Audio quality is good",
+            ))
+            return
+        except sr.RequestError as e:
+            await _send_command_response(ctx, embed=error_embed(
+                "āŒ Recognition Service Error",
+                f"Could not request results from speech recognition service: {e}",
+            ))
+            return
+
+        if not text.strip():
+            await _send_command_response(ctx, embed=discord.Embed(
+                title="āš ļø No Speech Detected",
+                description="No speech was detected in the audio file. Please ensure the audio contains clear speech.",
+                color=0xffa500,
+            ))
+            return
+
+        truncated = len(text) > max_field_chars
+        embed = discord.Embed(
+            title="šŸŽ¤ Speech to Text Result",
+            color=0x00ff00,
+            timestamp=datetime.now(),
+        )
+        embed.add_field(
+            name="šŸ“ Transcribed Text",
+            value=(text[:max_field_chars] + "...") if truncated else text,
+            inline=False,
+        )
+        if truncated:
+            embed.add_field(
+                name="ā„¹ļø Note",
+                value=f"Text was truncated. Full length: {len(text)} characters.",
+                inline=False,
+            )
+        embed.add_field(
+            name="šŸ“Š File Info",
+            value=f"**Filename:** {attachment.filename}\n**Size:** {attachment.size / 1024:.1f} KB\n**Format:** {file_extension.upper()}",
+            inline=True,
+        )
+        embed.set_footer(text=f"Processed by {ctx.author.display_name}")
+        await _send_command_response(ctx, embed=embed)
+
+        if truncated:
+            # Attach the full text whenever the embed was shortened, so no
+            # transcript is only available in truncated form.  The file is
+            # built in memory; no temp file is needed.
+            text_filename = f"transcription_{ctx.author.id}_{int(time.time())}.txt"
+            transcript = (
+                "Speech-to-Text Transcription\n"
+                f"Original file: {attachment.filename}\n"
+                f"Processed by: {ctx.author.display_name}\n"
+                f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
+                f"\n{'='*50}\n\n"
+                f"{text}"
+            )
+            await ctx.send(
+                "šŸ“„ **Full transcription** (text too long for embed):",
+                file=discord.File(io.BytesIO(transcript.encode('utf-8')), text_filename),
+            )
+
+    except Exception as e:
+        logger.error(f"Error in speech_to_text command: {e}")
+        await _send_command_response(ctx, embed=error_embed(
+            "āŒ Error",
+            "An error occurred while processing the audio file. Please try again.",
+        ))
+    finally:
+        if temp_filepath is not None:
+            _remove_temp_file(temp_filepath)
+
+@client.hybrid_command()
+async def live_speech(ctx, duration: int = 10):
+    """Placeholder for live speech recognition (not implemented yet).
+
+    Usage:
+    /live_speech [duration in seconds (default: 10, max: 30)]
+
+    Note: Nothing is recorded yet; use /speech_to_text with an audio file.
+    """
+    # ``duration`` is accepted for interface compatibility but is unused
+    # until recording is implemented.  No "recording" notice is sent,
+    # because nothing is actually recorded.
+    if getattr(ctx, 'interaction', None):
+        await ctx.defer()
+
+    try:
+        if not ctx.author.voice:
+            await _send_command_response(ctx, ephemeral=True, embed=discord.Embed(
+                title="āŒ Not in Voice Channel",
+                description="You need to be in a voice channel to use live speech recognition.",
+                color=0xff0000,
+            ))
+            return
+
+        await _send_command_response(ctx, embed=discord.Embed(
+            title="šŸŽ¤ Live Speech Recognition",
+            description="āš ļø **Feature Note**\n\nLive microphone recording requires additional setup. Use `/speech_to_text` with audio files instead.",
+            color=0xffa500,
+        ))
+
+    except Exception as e:
+        logger.error(f"Error in live_speech command: {e}")
+        await _send_command_response(ctx, embed=discord.Embed(
+            title="šŸŽ¤ Live Speech Recognition",
+            description="āŒ **Error occurred**\n\nCould not process live speech recognition.",
+            color=0xff0000,
+        ))
+
 @client.hybrid_command()
 async def toggle_feature(ctx, feature: str, state: str):
     """Allows admin to enable or disable bot features."""