Introduction
It's interesting to see how many free tools Google has started to provide to people thanks to its massive cloud computing capability.
I was so blown away by the sound quality that I created this simple program to read my favorite eBooks aloud. So far there are 7 different language voices in exceptional quality:
English, French, Italian, Spanish, German, Czech, Haitian-Creole, Hindi.
And, unfortunately, 27 [sub-par] quality voices that were recently integrated via the third-party open-source eSpeak engine:
Afrikaans, Albanian, Catalan, Chinese (Mandarin), Croatian, Danish, Dutch, Finnish, Greek, Hungarian, Icelandic, Indonesian, Latvian, Macedonian, Norwegian, Polish, Portuguese, Romanian, Russian, Serbian, Slovak, Swahili, Swedish, Turkish, Vietnamese, Welsh.
Google keeps replacing them with higher-quality versions as time progresses; the Czech voice, for example, was added recently.
Let Google TTS say your text of chosen language via simple URL
http://translate.google.com/translate_tts?tl=en&q=hello+world
Yes, it's the same service that is integrated into Google's Android and powers pronunciation in Google Translate.
Even though it's a web-based service, it's free, and it sends you back an mp3 with TTS that is, for some languages, light years ahead of most paid-for TTS engines.
Let Google Translate detect language of your text
http://translate.google.com/translate_a/t?client=t&sl=auto&text=hello+world
What we receive is the detected language, which we in turn use to let TTS know which voice we want to hear. Notice sl=auto: it sets the "source language" parameter to autodetection.
Yes, Google's language detection is often unreliable, as you can see on the official Google Translate page. So you had better set the language in your app manually, but it's an interesting feature to test anyway.
Code
The code is slightly larger because we need to detect the language per line, split the text into chunks of at most 100 characters, and send each one as a URL-encoded HTTP GET request. Google sends back an mp3 file, which we stream as it's received thanks to DirectShow's streaming nature and an installed mp3 codec. This is a minimal sample so you can focus on how it works. Unimportant code like the hook is folded in snippet form, but feel free to unfold and format the code in the way you like. Replace all static buffers if you plan to use the code safely; cleanup and more robust error handling were left out so you can focus on the important parts, but it's still a lot of fun.
So enjoy ;)
#include <windows.h>
#include <shlwapi.h>
#include <Richedit.h>
#include <dshow.h>
#include <winsock.h>
#pragma comment(lib,"Strmiids.lib")
#pragma comment(lib,"Shlwapi.lib")
#pragma comment(lib,"wsock32.lib")
// DsHook(a, b, c): one-time vtable hook on a COM interface pointer.
//   a - interface pointer; its first pointer-sized field is read as the vtable address
//   b - zero-based index of the vtable slot to replace
//   c - replacement function; the previous slot value is saved into the function
//       pointer named c##_ (declared by the caller, must start out null)
// The `!c##_` guard makes the patch run only once, even if the macro is executed
// repeatedly. The macro expects a `DWORD no` variable in the calling scope; it
// receives the previous page protection from VirtualProtect.
// Every line but the last ends with a continuation so the whole body belongs to
// the macro, and the protected region is one pointer wide rather than a fixed
// 4 bytes so it also covers 64-bit vtable slots.
#define DsHook(a,b,c) if (!c##_) { \
    INT_PTR* p=b+*(INT_PTR**)a; VirtualProtect(&c##_,sizeof(INT_PTR),PAGE_EXECUTE_READWRITE,&no); \
    *(INT_PTR*)&c##_=*p; VirtualProtect(p,sizeof(INT_PTR),PAGE_EXECUTE_READWRITE,&no); *p=(INT_PTR)c; }
// SyncReadAlligned_ holds the original vtable entry saved by DsHook; `out` is the
// file handle the hook copies every received sample into.
HRESULT ( __stdcall * SyncReadAlligned_ ) ( void* inst, IMediaSample *smp ) ; HANDLE out;

// Replacement for the hooked reader method: forwards the call to the original
// implementation and, when the read succeeded, appends the sample's bytes to `out`.
//   inst - the object the method was invoked on (passed through unchanged)
//   smp  - media sample filled in by the original method
// Returns whatever the original method returned; a failed file write never
// changes the result seen by the caller.
HRESULT __stdcall SyncReadAlligned ( void* inst, IMediaSample *smp ) {
    HRESULT ret = SyncReadAlligned_ ( inst, smp );

    // Do not dump the buffer when the read itself failed: its contents and
    // length are not meaningful then.
    if ( FAILED(ret) || !smp ) return ret;

    BYTE* buf = 0;
    if ( FAILED(smp->GetPointer(&buf)) || !buf ) return ret;

    long len = smp->GetActualDataLength();
    if ( len > 0 && out && out != INVALID_HANDLE_VALUE ) {
        DWORD no = 0;
        WriteFile(out, buf, (DWORD)len, &no, 0);
    }
    return ret;
}
// Program entry point.
//
// Opens one TCP connection to translate.google.com, shows a rich-edit window and
// runs a polling message loop while that window is visible. When Return is
// pressed, the whole window text is fetched and cut into chunks (at '.', then at
// ',' if still too long, then at the last space within the first 100 characters).
// For every chunk:
//   1. the chunk is converted to UTF-8 and percent-encoded,
//   2. a language-detection request is sent over the socket and a two-letter
//      language code is taken from the reply ("en" if none is found),
//   3. a DirectShow graph renders the translate_tts URL for that language, the
//      source filter's IAsyncReader is hooked so the received bytes are also
//      written to c:/out.mp3, and the graph is run until it completes.
//
// NOTE(review): the same socket is reused for every detection request and only a
// single recv() is issued per request, so unread response bytes from an earlier
// request may still be queued; a full HTTP response parser is out of scope here.
int WINAPI WinMain(HINSTANCE inst,HINSTANCE prev,LPSTR cmd,int show) {
    const size_t kMaxChunk = 100;                       // longest chunk sent to the service
    MSG msg={0}; WSADATA wsa; DWORD no; HRESULT hr;     // `no` is required by the DsHook macro

    CoInitialize(0);
    const bool wsaReady = WSAStartup(MAKEWORD(1,1),&wsa) == 0;
    LoadLibraryA("RichEd20");

    // Resolve and connect; every step is checked so that a DNS or socket failure
    // ends the program cleanly instead of dereferencing a null HOSTENT.
    SOCKET s = wsaReady ? socket(AF_INET,SOCK_STREAM,IPPROTO_TCP) : INVALID_SOCKET;
    sockaddr_in addr={AF_INET,htons(80)};
    HOSTENT* dns = wsaReady ? gethostbyname("translate.google.com") : 0;
    bool connected = false;
    if( s != INVALID_SOCKET && dns && dns->h_addr_list && dns->h_addr_list[0] && dns->h_length == 4 ) {
        memcpy(&addr.sin_addr.s_addr,dns->h_addr_list[0],4);
        connected = connect(s,(sockaddr*)&addr,sizeof(addr)) == 0;
    }
    if( !connected ) {
        if( s != INVALID_SOCKET ) closesocket(s);
        if( wsaReady ) WSACleanup();
        CoUninitialize();
        return 0;
    }

    HWND hwnd = CreateWindowA("RICHEDIT20W",0,WS_SIZEBOX|ES_MULTILINE|WS_VISIBLE|ES_AUTOVSCROLL|ES_AUTOHSCROLL|WS_SYSMENU|WS_CAPTION|WS_MINIMIZE|WS_HSCROLL|WS_VSCROLL,500,500,500,300,0,0,0,0);

    while ( IsWindowVisible(hwnd) ) {
        // Only inspect `msg` when a message was really fetched; otherwise a stale
        // Return key-down would be handled again. Sleep while idle so the polling
        // loop does not spin a CPU core.
        if( !PeekMessage(&msg,0,0,0,PM_REMOVE) ) { Sleep(1); continue; }
        TranslateMessage( &msg ); DispatchMessage( &msg );
        if( msg.wParam != VK_RETURN || msg.message != WM_KEYDOWN ) continue;

        // Select everything, copy it out as wide characters, then drop the selection.
        // One spare WCHAR beyond the terminator is allocated as slack.
        size_t cap = (size_t)GetWindowTextLength(hwnd) + 2;
        CHARRANGE ch={0,-1}; SendMessage(hwnd,EM_EXSETSEL,0,(LPARAM)&ch);
        WCHAR* Txt = (WCHAR*)calloc(cap,sizeof(WCHAR));
        if( Txt ) SendMessage(hwnd,EM_GETSELTEXT,0,(LPARAM)Txt);
        ch.cpMin=-1;
        SendMessage(hwnd,EM_EXSETSEL,0,(LPARAM)&ch);
        if( !Txt ) continue;

        out = CreateFileA("c:/out.mp3",GENERIC_WRITE,FILE_SHARE_READ,0,CREATE_ALWAYS,0,0);

        WCHAR* txt = Txt;
        WCHAR* const end = Txt + wcslen(Txt);           // never read at or beyond this point
        while( txt < end && IsWindowVisible(hwnd) ) {
            // --- pick the next chunk: [txt, txt+n), then skip `skip` delimiter chars ---
            const WCHAR* dot = wcschr(txt,L'.');
            size_t n    = dot ? (size_t)(dot - txt) : (size_t)(end - txt);
            size_t skip = dot ? 1 : 0;
            if( n > kMaxChunk ) {                       // still too long: cut at the first comma
                for( size_t i = 0; i < n; ++i )
                    if( txt[i] == L',' ) { n = i; skip = 1; break; }
            }
            if( n > kMaxChunk ) {                       // still too long: cut at the last space
                size_t i = kMaxChunk;
                while( i > 0 && txt[i] != L' ' ) --i;   // bounded: never walks before txt
                if( txt[i] == L' ' ) { n = i; skip = 1; }
                else {
                    // No space at all: hard split without dropping a character and
                    // without separating a UTF-16 surrogate pair.
                    n = kMaxChunk; skip = 0;
                    if( (txt[n-1] & 0xFC00) == 0xD800 ) --n;
                }
            }
            WCHAR chunk[kMaxChunk + 1];
            memcpy(chunk,txt,n*sizeof(WCHAR)); chunk[n]=0;
            txt += n + skip;
            if( n == 0 ) continue;                      // nothing to speak between two delimiters

            // --- UTF-8 + percent-encode (at most 100 WCHARs -> 300 bytes -> 900 escape chars) ---
            char utf[1000],esc[1000]={0},*a;
            if( WideCharToMultiByte(CP_UTF8,0,chunk,-1,utf,sizeof(utf),0,0) <= 0 ) continue;
            char* w = esc;
            for( const char* b = utf; *b; ++b ) w += sprintf(w,"%%%0.2x",*(const BYTE*)b);

            // --- language detection ---
            char buf[1000]; sprintf(buf,"GET /translate_a/t?client=t&sl=auto&text=%s HTTP/1.1\r\nUser-Agent: Mozilla/5.0\r\n\r\n\r\n\r\n",esc);
            char lng[3]={"en"};
            int got = -1;
            if( send(s,buf,(int)strlen(buf),0) != SOCKET_ERROR ) got = recv(s,buf,sizeof(buf)-1,0);
            if( got > 0 ) {
                buf[got]=0;                             // strstr needs a terminated string
                if( (a=strstr(buf,"]],,\"")) && a+7 <= buf+got ) memcpy(lng,a+5,2);
            }

            // --- play the chunk through DirectShow ---
            IGraphBuilder* graph = 0; IMediaControl* ctrl = 0; IMediaEvent* event = 0;
            CoCreateInstance( CLSID_FilterGraph, 0, CLSCTX_INPROC,IID_IGraphBuilder, (void **)&graph );
            if( graph ) {
                graph->QueryInterface( IID_IMediaControl, (void **)&ctrl );
                graph->QueryInterface( IID_IMediaEventEx, (void **)&event );
            }
            WCHAR url[1000]; wsprintfW(url,L"http://translate.google.com/translate_tts?tl=%S&q=%S",lng,esc);

            if( ctrl && event && (hr=ctrl->RenderFile(url)) == S_OK ) {
                // Hook the source filter's reader so the incoming bytes are captured
                // too; if any lookup fails the chunk is still played, just not saved.
                IBaseFilter* filter = 0; IPin* pin = 0; IAsyncReader* reader = 0;
                if( SUCCEEDED(graph->FindFilterByName(url,&filter)) && filter &&
                    SUCCEEDED(filter->FindPin(L"Output",&pin)) && pin &&
                    SUCCEEDED(pin->QueryInterface(IID_IAsyncReader,(void**)&reader)) && reader ) {
                    DsHook(reader,6,SyncReadAlligned);
                }

                hr=ctrl->Run(); long code=0; LONG_PTR p1=0,p2=0;
                // Stop waiting on completion, on an aborted graph, or when the
                // window is closed, so a failed stream cannot hang the program.
                while( SUCCEEDED(hr) && code != EC_COMPLETE && code != EC_ERRORABORT &&
                       code != EC_USERABORT && IsWindowVisible(hwnd) ) {
                    while( PeekMessage(&msg,0,0,0,PM_REMOVE) ) { TranslateMessage( &msg ); DispatchMessage( &msg ); }
                    if( SUCCEEDED(event->GetEvent(&code, &p1, &p2, 0)) ) event->FreeEventParams(code,p1,p2);
                    Sleep(1);
                }
                ctrl->Stop();

                if( reader ) reader->Release();
                if( pin ) pin->Release();
                if( filter ) filter->Release();
            }
            // Released on every path, including a failed RenderFile.
            if( ctrl ) ctrl->Release();
            if( event ) event->Release();
            if( graph ) graph->Release();
        }

        free(Txt);
        if( out && out != INVALID_HANDLE_VALUE ) CloseHandle(out);
        out = 0;
    }

    closesocket(s);
    WSACleanup();
    CoUninitialize();
    return 0;
}
Points of Interest
Notice that we are passing web address directly to DirectShow. RenderFile() call actually generates whole graph including stream splitter, mp3 decoder and output to sound device.
This simple trick allows us to, for example, listen to internet radio stations without much work. It requires you to have at least some mp3 codec installed, which most of you probably have. If not, install ffdshow, which is a free multi-codec that plays pretty much everything you throw at it.
Another thing is that the correct way to grab the received data would be to implement and connect a sample grabber filter between the source and splitter filters. But that would require you to use the DirectShow SDK, which is a pretty complicated thing to make compilable and hardly beats implementing just one procedure.
Unknown languages are not played, but you can make a lot of substitutions, like, let's say, mapping "nl" to "de", etc. Mix sentences in different languages just for fun ;)
History
- 28.3 first version
- 31.3 combo box for manual language selection replaced by automatic language detection (per sentence)
- 3.4 added capturing of received stream to mp3 file on disk
- 15.5 added info that 29 new languages are synthesized now. Poor quality though.
- 16.5 changed code to Unicode and uploaded fixed exe so Chinese Hindi etc works now
- 2.6.2011 uses DNS instead of IP + updated language detection to reflect Google changes. added source to zip