modified: bot.py

2025-08-29 17:27:44 +02:00
parent 215d4bd11e
commit 6c3e2d44af
1 changed files with 295 additions and 0 deletions
--- a/bot.py
+++ b/bot.py
@@ -21,6 +21,7 @@ import hashlib
 from datetime import datetime, timedelta
 import concurrent.futures
 from gtts import gTTS
+import speech_recognition as sr
 import shutil
 from bs4 import BeautifulSoup
 from dotenv import load_dotenv
@@ -2506,6 +2507,300 @@ async def leave(ctx):
    else:
        await ctx.send("I am not in a voice channel.")

+@client.hybrid_command()
+async def speech_to_text(ctx, *, audio_attachment: discord.Attachment = None):
+    """Converts audio file to text using speech recognition.
+    
+    Usage:
+    /speech_to_text [attach audio file]
+    
+    Supported formats: .wav, .mp3, .m4a, .ogg, .flac
+    """
+    # Check if it's a slash command and defer if needed
+    is_slash_command = hasattr(ctx, 'interaction') and ctx.interaction
+    if is_slash_command:
+        await ctx.defer()
+    
+    # Helper function for sending responses
+    async def send_response(content=None, embed=None, ephemeral=False):
+        try:
+            if is_slash_command:
+                if embed:
+                    await ctx.followup.send(embed=embed, ephemeral=ephemeral)
+                else:
+                    await ctx.followup.send(content, ephemeral=ephemeral)
+            else:
+                if embed:
+                    await ctx.send(embed=embed)
+                else:
+                    await ctx.send(content)
+        except Exception as e:
+            logger.error(f"Error sending response: {e}")
+            try:
+                await ctx.send(f"Error: {e}")
+            except:
+                pass
+
+    try:
+        # Get attachment (either from parameter or message)
+        attachment = audio_attachment
+        if not attachment and hasattr(ctx, 'message') and ctx.message and ctx.message.attachments:
+            attachment = ctx.message.attachments[0]
+        
+        if not attachment:
+            embed = discord.Embed(
+                title="❌ No Audio File",
+                description="Please attach an audio file to convert to text.\n\nSupported formats: `.wav`, `.mp3`, `.m4a`, `.ogg`, `.flac`",
+                color=0xff0000
+            )
+            await send_response(embed=embed, ephemeral=True)
+            return
+        
+        # Check file type
+        supported_formats = ['.wav', '.mp3', '.m4a', '.ogg', '.flac']
+        file_extension = os.path.splitext(attachment.filename)[1].lower()
+        
+        if file_extension not in supported_formats:
+            embed = discord.Embed(
+                title="❌ Unsupported Format",
+                description=f"File format `{file_extension}` is not supported.\n\nSupported formats: {', '.join(supported_formats)}",
+                color=0xff0000
+            )
+            await send_response(embed=embed, ephemeral=True)
+            return
+        
+        # Check file size (max 25MB for Discord, but we'll be more conservative)
+        max_size = 10 * 1024 * 1024  # 10MB
+        if attachment.size > max_size:
+            embed = discord.Embed(
+                title="❌ File Too Large",
+                description=f"File size ({attachment.size / 1024 / 1024:.1f}MB) exceeds maximum allowed size (10MB).",
+                color=0xff0000
+            )
+            await send_response(embed=embed, ephemeral=True)
+            return
+        
+        # Download and process the audio file
+        embed = discord.Embed(
+            title="🎤 Processing Audio",
+            description="Downloading and converting audio to text...",
+            color=0x3498db
+        )
+        await send_response(embed=embed)
+        
+        # Create temp directory if it doesn't exist
+        temp_dir = "temp_audio"
+        if not os.path.exists(temp_dir):
+            os.makedirs(temp_dir)
+        
+        # Download file
+        temp_filename = f"{uuid.uuid4()}_{attachment.filename}"
+        temp_filepath = os.path.join(temp_dir, temp_filename)
+        
+        audio_data = await attachment.read()
+        with open(temp_filepath, 'wb') as f:
+            f.write(audio_data)
+        
+        # Initialize speech recognizer
+        recognizer = sr.Recognizer()
+        
+        try:
+            # Convert audio to WAV if needed (speech_recognition works best with WAV)
+            wav_filepath = temp_filepath
+            if file_extension != '.wav':
+                from pydub import AudioSegment
+                wav_filepath = os.path.splitext(temp_filepath)[0] + '.wav'
+                audio = AudioSegment.from_file(temp_filepath)
+                audio.export(wav_filepath, format="wav")
+            
+            # Process audio file
+            with sr.AudioFile(wav_filepath) as source:
+                # Adjust for ambient noise
+                recognizer.adjust_for_ambient_noise(source, duration=0.5)
+                audio = recognizer.record(source)
+            
+            # Convert speech to text using Google Speech Recognition
+            try:
+                text = recognizer.recognize_google(audio, language='en-US')
+                
+                if not text.strip():
+                    embed = discord.Embed(
+                        title="⚠️ No Speech Detected",
+                        description="No speech was detected in the audio file. Please ensure the audio contains clear speech.",
+                        color=0xffa500
+                    )
+                    await send_response(embed=embed)
+                    return
+                
+                # Create success embed
+                embed = discord.Embed(
+                    title="🎤 Speech to Text Result",
+                    color=0x00ff00,
+                    timestamp=datetime.now()
+                )
+                
+                # Truncate text if too long for embed
+                if len(text) > 1000:
+                    embed.add_field(
+                        name="📝 Transcribed Text",
+                        value=text[:1000] + "...",
+                        inline=False
+                    )
+                    embed.add_field(
+                        name="ℹ️ Note",
+                        value=f"Text was truncated. Full length: {len(text)} characters.",
+                        inline=False
+                    )
+                else:
+                    embed.add_field(
+                        name="📝 Transcribed Text",
+                        value=text,
+                        inline=False
+                    )
+                
+                embed.add_field(
+                    name="📊 File Info",
+                    value=f"**Filename:** {attachment.filename}\n**Size:** {attachment.size / 1024:.1f} KB\n**Format:** {file_extension.upper()}",
+                    inline=True
+                )
+                
+                embed.set_footer(text=f"Processed by {ctx.author.display_name}")
+                
+                await send_response(embed=embed)
+                
+                # If text is very long, also send as text file
+                if len(text) > 1500:
+                    text_filename = f"transcription_{ctx.author.id}_{int(time.time())}.txt"
+                    text_filepath = os.path.join(temp_dir, text_filename)
+                    
+                    with open(text_filepath, 'w', encoding='utf-8') as f:
+                        f.write(f"Speech-to-Text Transcription\n")
+                        f.write(f"Original file: {attachment.filename}\n")
+                        f.write(f"Processed by: {ctx.author.display_name}\n")
+                        f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
+                        f.write(f"\n{'='*50}\n\n")
+                        f.write(text)
+                    
+                    with open(text_filepath, 'rb') as f:
+                        await ctx.send(
+                            "📄 **Full transcription** (text too long for embed):",
+                            file=discord.File(f, text_filename)
+                        )
+                    
+                    # Clean up text file
+                    try:
+                        os.remove(text_filepath)
+                    except:
+                        pass
+                
+            except sr.UnknownValueError:
+                embed = discord.Embed(
+                    title="❌ Speech Not Recognized",
+                    description="Could not understand the speech in the audio file. Please ensure:\n• Audio is clear and not too noisy\n• Speech is in English\n• Audio quality is good",
+                    color=0xff0000
+                )
+                await send_response(embed=embed)
+                
+            except sr.RequestError as e:
+                embed = discord.Embed(
+                    title="❌ Recognition Service Error",
+                    description=f"Could not request results from speech recognition service: {e}",
+                    color=0xff0000
+                )
+                await send_response(embed=embed)
+        
+        finally:
+            # Clean up temporary files
+            try:
+                if os.path.exists(temp_filepath):
+                    os.remove(temp_filepath)
+                if wav_filepath != temp_filepath and os.path.exists(wav_filepath):
+                    os.remove(wav_filepath)
+            except Exception as e:
+                logger.warning(f"Could not clean up temp files: {e}")
+        
+    except Exception as e:
+        logger.error(f"Error in speech_to_text command: {e}")
+        embed = discord.Embed(
+            title="❌ Error",
+            description="An error occurred while processing the audio file. Please try again.",
+            color=0xff0000
+        )
+        await send_response(embed=embed)
+
+@client.hybrid_command()
+async def live_speech(ctx, duration: int = 10):
+    """Records audio from your microphone and converts it to text.
+    
+    Usage:
+    /live_speech [duration in seconds (default: 10, max: 30)]
+    
+    Note: You need to be in a voice channel with the bot.
+    """
+    # Check if it's a slash command and defer if needed
+    is_slash_command = hasattr(ctx, 'interaction') and ctx.interaction
+    if is_slash_command:
+        await ctx.defer()
+    
+    # Helper function for sending responses
+    async def send_response(content=None, embed=None, ephemeral=False):
+        try:
+            if is_slash_command:
+                if embed:
+                    await ctx.followup.send(embed=embed, ephemeral=ephemeral)
+                else:
+                    await ctx.followup.send(content, ephemeral=ephemeral)
+            else:
+                if embed:
+                    await ctx.send(embed=embed)
+                else:
+                    await ctx.send(content)
+        except Exception as e:
+            logger.error(f"Error sending response: {e}")
+
+    try:
+        # Validate duration
+        if duration < 1:
+            duration = 1
+        elif duration > 30:
+            duration = 30
+        
+        # Check if user is in voice channel
+        if not ctx.author.voice:
+            embed = discord.Embed(
+                title="❌ Not in Voice Channel",
+                description="You need to be in a voice channel to use live speech recognition.",
+                color=0xff0000
+            )
+            await send_response(embed=embed, ephemeral=True)
+            return
+        
+        # Start recording notification
+        embed = discord.Embed(
+            title="🎤 Live Speech Recognition",
+            description=f"🔴 **Recording for {duration} seconds...**\n\nSpeak clearly into your microphone!",
+            color=0xff0000
+        )
+        await send_response(embed=embed)
+        
+        # Note: This is a simplified version as Discord bots can't directly access user microphones
+        # This would require a client-side application or different implementation
+        embed = discord.Embed(
+            title="🎤 Live Speech Recognition",
+            description="⚠️ **Feature Note**\n\nLive microphone recording requires additional setup. Use `/speech_to_text` with audio files instead.",
+            color=0xffa500
+        )
+        await send_response(embed=embed)
+        
+    except Exception as e:
+        logger.error(f"Error in live_speech command: {e}")
+        embed = discord.Embed(
+            title="🎤 Live Speech Recognition",
+            description="❌ **Error occurred**\n\nCould not process live speech recognition.",
+            color=0xff0000
+        )
+        await send_response(embed=embed)
+
@client.hybrid_command()
 async def toggle_feature(ctx, feature: str, state: str):
    """Allows admin to enable or disable bot features."""