% Conference paper by Sonnenburg, Raetsch and Schoelkopf (2005) on large-scale
% SVM training for genomic sequences. The ps, pdf and abstract fields are
% non-standard; standard bibliography styles ignore them, so they only serve
% tools that read them explicitly.
@inproceedings{SonRaeSch05a,
  author    = {Sonnenburg, S{\"o}ren and R{\"a}tsch, Gunnar and Sch{\"o}lkopf, Bernhard},
  title     = {Large Scale Genomic Sequence {SVM} Classifiers},
  booktitle = {Proceedings of the 22nd International Machine Learning Conference},
  editor    = {De Raedt, Luc and Wrobel, Stefan},
  publisher = {ACM Press},
  year      = {2005},
  ps        = {http://sonnenburgs.de/soeren/publications/SonRaeSch05a.ps.gz},
  pdf       = {http://sonnenburgs.de/soeren/publications/SonRaeSch05a.pdf},
  abstract  = {In genomic sequence analysis tasks like splice site
    recognition or promoter identification, large amounts of training
    sequences are available, and indeed needed to achieve sufficiently
    high classification performances. In this work we study two recently
    proposed and successfully used kernels, namely the Spectrum kernel
    and the Weighted Degree kernel (WD). In particular, we suggest
    several extensions using Suffix Trees and modifications of an
    SMO-like SVM training algorithm in order to accelerate the training
    of the SVMs and their evaluation on test sequences. Our simulations
    show that for the spectrum kernel and WD kernel, large scale SVM
    training can be accelerated by factors of 20 and 4 times,
    respectively, while using much less memory (e.g. no kernel caching).
    The evaluation on new sequences is often several thousand times
    faster using the new techniques (depending on the number of Support
    Vectors). Our method allows us to train on sets as large as one
    million sequences.},
}