% GoTAL 2008 paper (Springer LNCS, volume 5221): Turkish morphological parser,
% perceptron-based disambiguator and web corpus.
% The percent sign in the abstract is escaped so that styles which typeset the
% abstract do not treat the rest of the line as a TeX comment.
@inproceedings{sak-et-al-gotal-08,
	Abstract = {In this paper, we propose a set of language resources
	for building Turkish language processing applications.
	Specifically, we present a finite-state implementation of a morphological parser,
	an averaged perceptron-based morphological disambiguator,
	and compilation of a web corpus. Turkish is an agglutinative
	language with a highly productive inflectional and derivational morphology.
	We present an implementation of a morphological parser based on two-level morphology.
	This parser is one of the most complete parsers for Turkish
	and it runs independent of any other external system
	such as PC-KIMMO in contrast to existing parsers.
	Due to complex phonology and morphology of Turkish, parsing introduces some ambiguous parses.
	We developed a morphological disambiguator with accuracy of about 98\%
	using averaged perceptron algorithm. We also present our efforts
	to build a Turkish web corpus of about 423 million words.},
	Author = {Ha{\c s}im Sak and Tunga G{\"u}ng{\"o}r and Murat Sara{\c c}lar},
	Booktitle = {{GoTAL} 2008},
	Pages = {417--427},
	Title = {{Turkish} Language Resources: Morphological Parser, Morphological Disambiguator and Web Corpus},
	Volume = {5221},
	Series = {Lecture Notes in Computer Science},
	Publisher = {Springer},
	Year = {2008}
}
		

% Interspeech 2007 paper on sub-word language models for Turkish broadcast news
% transcription.
% The publication status lives in Note rather than in Booktitle, so the venue
% name stays clean; update or drop the note once page numbers are known.
@inproceedings{arisoy-et-al-interspeech-07,
	Abstract = {The aim of this study is to develop a speech recognition system
	for Turkish broadcast news. State-of-the-art speech recognition
	systems utilize statistical models. A large amount of data is required
	to reliably estimate these models. For this study, a large
	Turkish Broadcast News database, consisting of the speech signal
	and corresponding transcriptions, is being collected. In this
	paper, information about this database and experiments performed
	using the system developed on the collected data are
	presented. In addition to the baseline system, various sub-word
	language models are investigated. Lexical stem-endings are
	proposed as a novel unit for language modeling and are shown
	to perform better than surface stem-endings and morphs. Currently,
	our best systems have lower than 20\% error on clean speech.},
	Author = {Ebru Ar{\i}soy and Ha{\c s}im Sak and Murat Sara{\c c}lar},
	Booktitle = {Proceedings of Interspeech 2007 - Eurospeech},
	Title = {Language Modeling for Automatic {Turkish} Broadcast News Transcription},
	Year = {2007},
	Note = {To appear},
}


% CICLing 2007 paper (Springer LNCS, volume 4394): perceptron-based reranking
% for Turkish morphological disambiguation.
% Series and Volume are kept in separate fields (Volume is the bare number),
% and percent signs in the abstract are escaped for styles that typeset it.
@inproceedings{sak-et-al-cicling-07,
	Abstract = {This paper describes the application of the perceptron algorithm
	to the morphological disambiguation of Turkish text. Turkish has
	a productive derivational morphology. Due to the ambiguity caused by
	complex morphology, a word may have multiple morphological parses,
	each with a different stem or sequence of morphemes. The methodology
	employed is based on ranking with perceptron algorithm which has
	been successful in some NLP tasks in English. We use a baseline statistical
	trigram-based model of a previous work to enumerate an n-best list
	of candidate morphological parse sequences for each sentence. We then
	apply the perceptron algorithm to rerank the n-best list using a set of
	23 features. The perceptron trained to do morphological disambiguation
	improves the accuracy of the baseline model from 93.61\% to 96.80\%.
	When we train the perceptron as a POS tagger, the accuracy is 98.27\%.
	Turkish morphological disambiguation and POS tagging results that we
	obtained is the best reported so far.},
	Author = {Ha{\c s}im Sak and Tunga G{\"u}ng{\"o}r and Murat Sara{\c c}lar},
	Booktitle = {{CICLing} 2007},
	Pages = {107--118},
	Title = {Morphological Disambiguation of {Turkish} Text with Perceptron Algorithm},
	Series = {Lecture Notes in Computer Science},
	Volume = {4394},
	Publisher = {Springer},
	Year = {2007},
	Url = {http://www.cmpe.boun.edu.tr/~hasim/papers/CICLing07.pdf}
}


% Journal article (Turkish Journal of Electrical Engineering and Computer
% Sciences, 14(2), 2006) describing a corpus-based unit-selection speech
% synthesis system for Turkish.
% The abstract was pasted from a PDF; split-ligature artefacts have been repaired.
@article{sak-et-al-turkjelec-06,
	Abstract = {Speech synthesis is the process of converting written text into machine-generated synthetic speech.
	Concatenative speech synthesis systems form utterances by concatenating pre-recorded speech units.
	Corpus-based methods use a large inventory to select the units to be concatenated. In this paper, we
	design and develop an intelligible and natural sounding corpus-based concatenative speech synthesis system
	for the Turkish language. The implemented system contains a front-end comprised of text analysis,
	phonetic analysis, and optional use of transplanted prosody. The unit selection algorithm is based on
	commonly used Viterbi decoding algorithm of the best-path in the network of the speech units using spectral
	discontinuity and prosodic mismatch objective cost measures. The back-end is the speech waveform
	generation based on the harmonic coding of speech and overlap-and-add mechanism. Harmonic coding
	enabled us to compress the unit inventory size by a factor of three. In this study, a Turkish phoneme
	set has been designed and a pronunciation lexicon for root words has been constructed. The importance
	of prosody in unit selection has been investigated by using transplanted prosody. A Turkish Diagnostic
	Rhyme Test (DRT) word list that can be used to evaluate the intelligibility of Turkish Text-to-Speech
	(TTS) systems has been compiled. Several experiments have been performed to evaluate the quality of the
	synthesized speech and we obtained 4.2 Mean Opinion Score (MOS) in the listening tests for our system,
	which is the first unit selection based system published for Turkish.},
	Author = {Ha{\c s}im Sak and Tunga G{\"u}ng{\"o}r and Ya{\c s}ar Safkan},
	Date-Modified = {2007-07-29 19:01:48 +0300},
	Journal = {Turkish Journal of Electrical Engineering and Computer Sciences},
	Number = {2},
	Pages = {209--223},
	Title = {A Corpus-Based Concatenative Speech Synthesis System for {Turkish}},
	Volume = {14},
	Year = {2006},
	Url = {http://www.cmpe.boun.edu.tr/~hasim/papers/TurkJElecEngin06.pdf}
}


% EUSIPCO 2005 conference paper: corpus-based concatenative speech synthesis
% for Turkish (conference-length version of the same system's description).
@inproceedings{sak-et-al-eusipco-05,
	author    = {Sak, Ha{\c s}im and G{\"u}ng{\"o}r, Tunga and Safkan, Ya{\c s}ar},
	title     = {Generation of Synthetic Speech from {Turkish} Text},
	booktitle = {13th European Signal Processing Conference (EUSIPCO 2005)},
	year      = {2005},
	url       = {http://www.cmpe.boun.edu.tr/~hasim/papers/EUSIPCO05.pdf},
	abstract  = {In this paper, we design and develop an intelligible and
		natural sounding corpus-based concatenative speech synthesis
		system for Turkish. The implemented system contains a front-end
		comprised of text analysis, phonetic analysis, and optional use
		of transplanted prosody. The unit selection algorithm is based
		on commonly used Viterbi decoding algorithm. The back-end is the
		speech waveform generation based on the harmonic coding of
		speech and overlap-and-add mechanism. In this study, a Turkish
		phoneme set has been designed and a pronunciation lexicon for
		root words has been constructed. For assessing the
		intelligibility of the synthesized speech, a DRT word list for
		Turkish has been compiled. The developed system obtained 4.2
		Mean Opinion Score (MOS) in the listening tests.},
}


% Master's thesis (2004) on a corpus-based concatenative speech synthesis
% system for Turkish.
% The abstract was pasted from a PDF; a lost hyphen ("overlap-and-add") and a
% split ligature ("objective") have been repaired.
@mastersthesis{sak-msthesis-04,
	Abstract = {Speech synthesis (text-to-speech) is the process of converting the written text
	into machine generated synthetic speech. Concatenative speech synthesis systems render
	speech by concatenating pre-recorded speech units. Corpus-based methods (unit
	selection) use a large inventory to select the units and concatenate. This thesis is part
	of an effort to design and develop an intelligible and natural sounding corpus-based
	concatenative speech synthesis system for Turkish. The implemented system contains
	a relatively simple front-end comprised of text analysis, phonetic analysis, and optional
	use of transplanted prosody. The unit selection algorithm is based on commonly used
	Viterbi decoding algorithm of the best path in the network of the units. The back-end
	is the speech waveform generation based on the harmonic coding of speech and overlap-and-add mechanism.
	In this work, the different unit sizes such as syllables, phones and
	half-phones have been experimented with. Speech corpus design and recording script
	preparation methods have been explained. A speech model based on harmonic coding
	of speech has been developed for speech representation and waveform generation. The
	harmonic coding has enabled us to compress the unit inventory size by a factor of three.
	A Viterbi decoding algorithm using spectral discontinuity cost and prosodic mismatch
	objective cost measures has been implemented. A Turkish phoneme set has been designed.
	Text-to-phoneme conversion for Turkish has been worked on, and a root words
	pronunciation lexicon has been constructed. A simple text normalization module has
	been implemented. The importance of prosody in unit selection has been studied by
	using transplanted prosody vs no synthetic prosody modeling in unit selection. Subjective
	tests have been carried out for evaluating the synthesized speech quality. The
	final Turkish speech synthesis system got 4.2 MOS like score in the listening tests.},
	Author = {Ha{\c s}im Sak},
	Date-Modified = {2007-07-29 19:20:05 +0300},
	School = {Bo{\u g}azi{\c c}i University},
	Title = {A Corpus-Based Concatenative Speech Synthesis System for {Turkish}},
	Year = {2004},
	Url = {http://www.cmpe.boun.edu.tr/~hasim/papers/MSThesis.pdf}
}