This is a simple voice chatbot that lets you speak to ChatGPT and listen to the response in continuous mode. It uses GPT-4o-mini, the TTS model, and speech recognition.
Introduction
This article shows how to create a web page that lets the user chat with ChatGPT in continuous mode, querying the text model and receiving a spoken answer, by combining speech recognition with the TTS model.
Using the code
The project is built with HTML/CSS and plain vanilla JavaScript; the front end is a simple structure with:
- two input fields, one for the API key and one for the prompt
- an icon to start speech recognition
- a div where the written answer will appear
- an (invisible) audio player
<header>
  <h1>AI Voice Chatbot</h1>
</header>
<div class="container">
  <form action="#" method="get" target="_blank" id="action-form">
    <input type="text" id="apikey" placeholder="Insert your API key here">
    <input id="prompt" type="text" placeholder="Activate Microphone to chat..." autocomplete="off" autofocus>
  </form>
  <div id="chathistory"></div>
  <p class="info"></p>
  <audio controls id="audioPlayer" style="display: none;"></audio>
</div>
The JavaScript file is made up of three main sections:
- the speech recognition functions/events
- the request to GPT-4o-mini to get an answer
- the request to the TTS model to generate the audio file
Let's first look at the speech recognition section, which is based on capturing the recognition events (start, end, result):
// References to the DOM elements defined in the HTML above
const actionForm = document.getElementById("action-form");
const actionFormInput = document.getElementById("prompt");
const chatContainer = document.getElementById("chathistory");

let recognition;
const SpeechRecognition =
  window.SpeechRecognition || window.webkitSpeechRecognition;

if (SpeechRecognition) {
  console.log("Your Browser supports speech Recognition");
  recognition = new SpeechRecognition();
  recognition.continuous = true;
  let idleTimer;

  // Inject the microphone button into the form
  actionForm.insertAdjacentHTML(
    "beforeend",
    '<button type="button"><i class="fas fa-microphone"></i></button>'
  );
  actionFormInput.style.paddingRight = "50px";

  const micBtn = actionForm.querySelector("button");
  const micIcon = micBtn.firstElementChild;

  // Toggle listening when the microphone icon is clicked
  micBtn.addEventListener("click", micBtnClick);
  function micBtnClick() {
    if (micIcon.classList.contains("fa-microphone")) {
      recognition.start();
    } else {
      recognition.stop();
    }
  }

  recognition.addEventListener("start", startSpeechRecognition);
  function startSpeechRecognition() {
    micIcon.classList.remove("fa-microphone");
    micIcon.classList.add("fa-microphone-slash");
    actionFormInput.focus();
    console.log("Voice activated, SPEAK");
    clearTimeout(idleTimer);
  }

  recognition.addEventListener("end", endSpeechRecognition);
  function endSpeechRecognition() {
    micIcon.classList.remove("fa-microphone-slash");
    micIcon.classList.add("fa-microphone");
    actionFormInput.focus();
    console.log("Speech recognition service disconnected");
  }

  recognition.addEventListener("result", resultOfSpeechRecognition);
  function resultOfSpeechRecognition(event) {
    const current = event.resultIndex;
    const transcript = event.results[current][0].transcript;

    // Saying "go" stops listening immediately; otherwise listening
    // stops automatically after 2 seconds of silence
    if (transcript.toLowerCase().trim() === "go") {
      recognition.stop();
    } else {
      clearTimeout(idleTimer);
      idleTimer = setTimeout(() => {
        recognition.stop();
      }, 2000);
    }
    sendMessage(transcript);
  }
} else {
  console.log("Your Browser does not support speech Recognition");
}
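The recognizer can optionally be tuned further. For example, these two standard Web Speech API settings (not used in the code above, shown only as a possible refinement) pin the language and suppress partial transcripts:

// Optional: force a recognition language instead of the browser default
recognition.lang = "en-US";
// Optional: only fire "result" for final transcripts, not interim ones
recognition.interimResults = false;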
We then create a context memory for the chatbot, using an array of messages:
let chatMemory = [];

// Seed the conversation with a system message that defines
// the chatbot's personality
chatMemory = createMemory([
  {
    role: "system",
    content: "You are a funny bot."
  }
]);
console.log(chatMemory);

// Copy the given messages into a fresh array that will hold
// the whole conversation history
function createMemory(messages) {
  const memory = [];
  for (const msg of messages) {
    memory.push({ role: msg.role, content: msg.content });
  }
  return memory;
}
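To make the structure concrete, this is what chatMemory might look like after a single exchange (the user and assistant strings here are hypothetical placeholders):

// Hypothetical snapshot of chatMemory after one question/answer turn
[
  { role: "system", content: "You are a funny bot." },
  { role: "user", content: "Tell me a joke about browsers." },
  { role: "assistant", content: "Why did the browser stay calm? It had all its tabs under control." }
]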
Then we have two functions: one to send the message to the OpenAI GPT-4o-mini model, and one to show the resulting response, with full token count and cost estimate.
async function sendMessage(transcript) {
  const apikey = document.getElementById("apikey").value;
  if (apikey === "") {
    alert("No OpenAI API Key found.");
    return; // no point in calling the API without a key
  }
  const userInput = transcript;
  console.log(userInput);
  if (userInput !== "") {
    // Show the user's message, then ask the model for an answer;
    // the updated conversation history is stored back in chatMemory
    showMessage("Guest", userInput, "");
    chatMemory = await getChatGPTResponse(userInput, chatMemory);
  }
}
function showMessage(sender, message, tokens, downloadLink) {
  const messageElement = document.createElement("div");
  if (sender === "Guest") {
    // User messages are rendered as a single line
    messageElement.innerHTML = `${sender}: ${message}`;
    messageElement.classList.add("user-message");
  } else {
    // Bot messages get the answer text, the token/cost summary
    // and a download link for the generated text
    const timestampElement = document.createElement("p");
    timestampElement.innerHTML = `${sender}: ${message} `;
    timestampElement.classList.add("chatgpt-message");
    messageElement.appendChild(timestampElement);

    const separator = document.createElement("p");
    separator.innerHTML = `${tokens}`;
    messageElement.classList.add("chatgpt-message");
    messageElement.appendChild(separator);

    const downloadElem = document.createElement("div");
    downloadElem.innerHTML = downloadLink;
    messageElement.appendChild(downloadElem);
  }
  // Append to the chat history and keep it scrolled to the bottom
  chatContainer.appendChild(messageElement);
  chatContainer.scrollTop = chatContainer.scrollHeight;
}
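For clarity, this is how the two rendering paths are typically invoked (the strings here are just placeholders):

// User message: no token info, no download link
showMessage("Guest", "Tell me a joke", "");
// Bot message: answer text, token/cost summary and download link
showMessage("VivacityGPT", "Here is a joke...", "<hr>Total Tokens: 42", '<a href="#">download</a>');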
Finally, we have the first request to the OpenAI API, querying the chat model:
async function getChatGPTResponse(userInput, chatMemory = []) {
  const apikey = document.getElementById("apikey").value;
  if (apikey === "") {
    alert("No OpenAI API Key found.");
    return chatMemory;
  }
  try {
    // Send the whole conversation history plus the new user message
    const response = await fetch("https://api.openai.com/v1/chat/completions", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: "Bearer " + apikey
      },
      body: JSON.stringify({
        model: "gpt-4o-mini",
        messages: [...chatMemory, { role: "user", content: userInput }]
      })
    });
    if (!response.ok) {
      throw new Error("Error while requesting to the API");
    }
    const data = await response.json();
    if (
      !data.choices ||
      !data.choices.length ||
      !data.choices[0].message ||
      !data.choices[0].message.content
    ) {
      throw new Error("Invalid API response");
    }
    const chatGPTResponse = data.choices[0].message.content.trim();
    console.log(chatGPTResponse);

    // Strip Markdown code fences (with or without a language tag)
    // and bold markers, so the text reads and sounds clean
    let cleanResponse = chatGPTResponse.replace(/```[a-z.+#-]*/gi, "");
    cleanResponse = cleanResponse.replace(/\*\*(.*?)\*\*/g, "$1");

    // Build the token count and cost estimate
    // (GPT-4o-mini pricing: 0.15 USD per 1M prompt tokens,
    //  0.60 USD per 1M completion tokens)
    const tokenCount = document.createElement("p");
    if (data.usage.completion_tokens) {
      const requestTokens = data.usage.prompt_tokens;
      const responseTokens = data.usage.completion_tokens;
      const totalTokens = data.usage.total_tokens;
      const pricepertokenprompt = 0.15 / 1000000;
      const pricepertokenresponse = 0.6 / 1000000;
      const priceperrequest = pricepertokenprompt * requestTokens;
      const priceperresponse = pricepertokenresponse * responseTokens;
      const totalExpense = priceperrequest + priceperresponse;
      tokenCount.innerHTML = `<hr>Your request used ${requestTokens} tokens and cost ${priceperrequest.toFixed(6)} USD<br>This response used ${responseTokens} tokens and cost ${priceperresponse.toFixed(6)} USD<br>Total Tokens: ${totalTokens}. This interaction cost you: ${totalExpense.toFixed(6)} USD (audio not included).`;
    } else {
      tokenCount.innerHTML = "Unable to track the number of used tokens.";
    }

    // Offer the answer as a downloadable text file
    const blob = new Blob([cleanResponse], { type: "text/plain" });
    const url = URL.createObjectURL(blob);
    const downloadLink = `<a href="${url}" download="chat.txt">Click here to download the generated answer</a>`;

    showMessage("VivacityGPT", cleanResponse, tokenCount.innerHTML, downloadLink);

    // Speak the answer, then store the exchange in the conversation memory
    convertiTestoInAudio(cleanResponse);
    chatMemory.push({ role: "user", content: userInput });
    chatMemory.push({ role: "assistant", content: cleanResponse });
    return chatMemory;
  } catch (error) {
    console.error(error);
    alert("An error occurred during the request. Check your OpenAI account or retry later.");
    return chatMemory; // keep the existing memory on failure
  }
}
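As a quick sanity check of the cost arithmetic, suppose a request used 1,000 prompt tokens and the answer used 500 completion tokens (hypothetical figures):

// 1000 prompt tokens * 0.15 / 1e6 = 0.000150 USD
// 500 completion tokens * 0.60 / 1e6 = 0.000300 USD
// Total for the text part of the interaction: 0.000450 USD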
Now we have the last function, which calls the TTS endpoint:
function convertiTestoInAudio(response) {
  const apikey = document.getElementById("apikey").value;
  const prompt = response;
  const selectedvoice = "nova";
  if (prompt) {
    fetch("https://api.openai.com/v1/audio/speech", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${apikey}`,
        "Content-Type": "application/json"
      },
      body: JSON.stringify({
        model: "tts-1",
        input: prompt,
        voice: selectedvoice
      })
    })
      .then((response) => {
        if (!response.ok) {
          throw new Error("TTS request failed");
        }
        return response.blob();
      })
      .then((blob) => {
        // Play the generated audio through the hidden player
        const audioUrl = URL.createObjectURL(blob);
        const audioPlayer = document.getElementById("audioPlayer");
        audioPlayer.src = audioUrl;
        audioPlayer.play();
        // When playback ends, restart recognition to keep the
        // conversation going; { once: true } prevents stacking
        // a new listener on every call
        audioPlayer.addEventListener(
          "ended",
          () => {
            recognition.start();
          },
          { once: true }
        );
      })
      .catch((error) => {
        console.error("Error while converting TTS: ", error);
      });
  } else {
    alert("Please insert a text prompt before converting.");
  }
}
Points of Interest
There are mainly two points of interest in this code:
- The speech recognition runs in a loop: it is stopped after a phrase is recognized, and restarted after the audio player finishes playing the TTS-generated audio. This allows continuous chatting without having to click on the microphone icon for every interaction: just click it to start chatting, and click it again when you want to finish. This is achieved via an event listener in the TTS call:
audioPlayer.addEventListener("ended", () => {
  recognition.start();
}, { once: true });
- All the functions are intertwined to create a fluid experience: the recognition result event calls the sendMessage() function, which asynchronously calls the getChatGPTResponse() function, which calls the convertiTestoInAudio() function, which in turn calls recognition.start() again.
As additional points of interest, note that speech recognition only works locally (localhost) or on pages served over a secure connection (HTTPS with a valid SSL certificate), and that the chatbot has been given a 'funny bot' personality to make chats less boring. Customizing the system prompt allows any personality/mood change.
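If you want to fail gracefully on insecure pages, a minimal check (my own addition, not part of the code above) could look like this:

// window.isSecureContext is true on localhost and on HTTPS pages,
// which is where the Web Speech API is allowed to run
if (!window.isSecureContext) {
  alert("Speech recognition requires HTTPS or localhost.");
}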
History
First version.