Below you will find the archive of SeqAn-related publications. Please see Google Scholar for a complete list of applications that cite SeqAn.

If you use SeqAn in any of your academic works, please cite the latest SeqAn paper:

  • Knut Reinert, Temesgen Hailemariam Dadi, Marcel Ehrhardt, Hannes Hauswedell, Svenja Mehringer, René Rahn, Jongkyu Kim, Christopher Pockrandt, Jörg Winkler, Enrico Siragusa, Gianvito Urgese, David Weese, “The SeqAn C++ template library for efficient sequence analysis: A resource for programmers”, vol. 261, 2017-11-10.
    cite this publication
    @article{fu_mi_publications2103,
     abstract = {Background
    
    The use of novel algorithmic techniques is pivotal to many important problems in life science. For example, the sequencing of the human genome (Venter et al., 2001) would not have been possible without advanced assembly algorithms, and the development of practical BWT-based read mappers has been instrumental for NGS analysis. However, owing to the high speed of technological progress and the urgent need for bioinformatics tools, there was a widening gap between state-of-the-art algorithmic techniques and the actual algorithmic components of tools that are in widespread use. We previously addressed this by introducing the SeqAn library of efficient data types and algorithms in 2008 (D{\"o}ring et al., 2008).
    
    Results
    
    The SeqAn library has matured considerably since its first publication 9 years ago. In this article we review its status as an established resource for programmers in the field of sequence analysis and its contributions to many analysis tools.
    
    Conclusions
    
    We anticipate that SeqAn will continue to be a valuable resource, especially since it started to actively support various hardware acceleration techniques in a systematic manner.
    
    Keywords
    
    NGS analysis; Software libraries; C++; Data structures},
     author = {Knut Reinert and Temesgen Hailemariam Dadi and Marcel Ehrhardt and Hannes Hauswedell and Svenja Mehringer and Ren{\'e} Rahn and Jongkyu Kim and Christopher Pockrandt and J{\"o}rg Winkler and Enrico Siragusa and Gianvito Urgese and David Weese},
     journal = {Journal of Biotechnology},
     keywords = {NGS analysis; Software libraries; C++; Data structures},
     month = {November},
     pages = {157--168},
     publisher = {Elsevier},
     title = {The SeqAn C++ template library for efficient sequence analysis: A resource for programmers},
     url = {http://publications.imp.fu-berlin.de/2103/},
     volume = {261},
     year = {2017}
    }
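
To give a flavor of what the library described above provides, here is a minimal sketch using the SeqAn 2.x alignment API (seqan::DnaString, seqan::globalAlignment); it assumes a SeqAn 2 installation whose include directory is passed to the compiler:

    // Global alignment of two DNA sequences with the SeqAn 2.x API.
    #include <iostream>
    #include <seqan/align.h>      // Align, globalAlignment, Score
    #include <seqan/sequence.h>   // DnaString

    int main()
    {
        seqan::DnaString s1 = "ACGTTTACGT";
        seqan::DnaString s2 = "ACGTACGT";

        seqan::Align<seqan::DnaString> align;   // alignment object with two gapped rows
        seqan::resize(seqan::rows(align), 2);
        seqan::assignSource(seqan::row(align, 0), s1);
        seqan::assignSource(seqan::row(align, 1), s2);

        // Needleman-Wunsch with match = 1, mismatch = -1, gap = -1.
        int score = seqan::globalAlignment(align, seqan::Score<int, seqan::Simple>(1, -1, -1));
        std::cout << "score: " << score << "\n" << align;
    }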

Publications by year

2023

  • Svenja Mehringer, Enrico Seiler, Felix Droop, Mitra Darvish, René Rahn, Martin Vingron, Knut Reinert, “Hierarchical Interleaved Bloom Filter: enabling ultrafast, approximate sequence queries”, vol. 24, iss. 131, 2023-05-31.
    cite this publication
    @article{fu_mi_publications2846,
     abstract = {We present a novel data structure for searching sequences in large databases: the Hierarchical Interleaved Bloom Filter (HIBF). It is extremely fast and space efficient, yet so general that it could serve as the underlying engine for many applications. We show that the HIBF is superior in build time, index size and search time while achieving a comparable or better accuracy compared to other state-of-the-art tools (Mantis and Bifrost). The HIBF builds an index up to 211 times faster, using up to 14 times less space and can answer approximate membership queries faster by a factor of up to 129. This can be considered a quantum leap that opens the door to indexing complete sequence archives like the European Nucleotide Archive or even larger metagenomics data sets.},
     author = {Svenja Mehringer and Enrico Seiler and Felix Droop and Mitra Darvish and Ren{\'e} Rahn and Martin Vingron and Knut Reinert},
     journal = {Genome Biology},
     month = {May},
     number = {131},
     publisher = {BioMed Central},
     title = {Hierarchical Interleaved Bloom Filter: enabling ultrafast, approximate sequence queries},
     url = {http://publications.imp.fu-berlin.de/2846/},
     volume = {24},
     year = {2023}
    }
  • Chenxu Pan, René Rahn, David Heller, Knut Reinert, “Linear: a framework to enable existing software to resolve structural variants in long reads with flexible and efficient alignment-free statistical models”, vol. 24, iss. 2, 2023-03-03.
    cite this publication
    @article{fu_mi_publications2947,
     abstract = {Alignment is the cornerstone of many long-read pipelines and plays an essential role in resolving structural variants (SVs). However, forced alignments of SVs embedded in long reads, inflexibility of integrating novel SV models and computational inefficiency remain problems. Here, we investigate the feasibility of resolving long-read SVs with alignment-free algorithms. We ask: (1) Is it possible to resolve long-read SVs with alignment-free approaches? and (2) Does it provide an advantage over existing approaches? To this end, we implemented the framework named Linear, which can flexibly integrate alignment-free algorithms such as the generative model for long-read SV detection. Furthermore, Linear addresses the problem of compatibility of alignment-free approaches with existing software. It takes as input long reads and outputs standardized results existing software can directly process. We conducted large-scale assessments in this work and the results show that the sensitivity and flexibility of Linear outperform alignment-based pipelines. Moreover, its computational efficiency is orders of magnitude higher.},
     author = {Chenxu Pan and Ren{\'e} Rahn and David Heller and Knut Reinert},
     journal = {Briefings in Bioinformatics},
     keywords = {alignment-free approach, graph generative model, structural variants resolution, long-read analysis},
     month = {March},
     number = {2},
     publisher = {Oxford University Press},
     title = {Linear: a framework to enable existing software to resolve structural variants in long reads with flexible and efficient alignment-free statistical models},
     url = {http://publications.imp.fu-berlin.de/2947/},
     volume = {24},
     year = {2023}
    }
  • Yuwei Wang, Bin Lian, Haohui Zhang, Yuanke Zhong, Jie He, Fashuai Wu, Knut Reinert, Xuequn Shang, Hui Yang, Jialu Hu, Anthony Mathelier, “A multi-view latent variable model reveals cellular heterogeneity in complex tissues for paired multimodal single-cell data”, vol. 39, iss. 1, 2023-01-09.
    cite this publication
    @article{fu_mi_publications2948,
     abstract = {Motivation:
    Single-cell multimodal assays allow us to simultaneously measure two different molecular features of the same cell, enabling new insights into cellular heterogeneity, cell development and diseases. However, most existing methods suffer from inaccurate dimensionality reduction for the joint-modality data, hindering their discovery of novel or rare cell subpopulations.
    
    Results:
    Here, we present VIMCCA, a computational framework based on variational-assisted multi-view canonical correlation analysis to integrate paired multimodal single-cell data. Our statistical model uses a common latent variable to interpret the common source of variances in two different data modalities. Our approach jointly learns an inference model and two modality-specific non-linear models by leveraging variational inference and deep learning. We perform VIMCCA and compare it with 10 existing state-of-the-art algorithms on four paired multi-modal datasets sequenced by different protocols. Results demonstrate that VIMCCA facilitates integrating various types of joint-modality data, thus leading to more reliable and accurate downstream analysis. VIMCCA improves our ability to identify novel or rare cell subtypes compared to existing widely used methods. Besides, it can also facilitate inferring cell lineage based on joint-modality profiles.
    
    Availability and implementation:
    The VIMCCA algorithm has been implemented in our toolkit package scbean ({\ensuremath{\geq}}0.5.0), and its code has been archived at https://github.com/jhu99/scbean under MIT license.},
     author = {Yuwei Wang and Bin Lian and Haohui Zhang and Yuanke Zhong and Jie He and Fashuai Wu and Knut Reinert and Xuequn Shang and Hui Yang and Jialu Hu and Anthony Mathelier},
     journal = {Bioinformatics},
     month = {January},
     number = {1},
     publisher = {Oxford University Press},
     title = {A multi-view latent variable model reveals cellular heterogeneity in complex tissues for paired multimodal single-cell data},
     url = {http://publications.imp.fu-berlin.de/2948/},
     volume = {39},
     year = {2023}
    }
  • Vitor C Piro, Bernhard Y Renard, “Contamination detection and microbiome exploration with GRIMER”, vol. 12, 2023-03-30.
    cite this publication
    @article{fu_mi_publications2949,
     abstract = {Background:
    Contamination detection is an important step that should be carefully considered in early stages when designing and performing microbiome studies to avoid biased outcomes. Detecting and removing true contaminants is challenging, especially in low-biomass samples or in studies lacking proper controls. Interactive visualizations and analysis platforms are crucial to better guide this step, to help identify and detect noisy patterns that could potentially be contamination. Additionally, external evidence, like aggregation of several contamination detection methods and the use of common contaminants reported in the literature, could help to discover and mitigate contamination.
    
    Results:
    We propose GRIMER, a tool that performs automated analyses and generates a portable and interactive dashboard integrating annotation, taxonomy, and metadata. It unifies several sources of evidence to help detect contamination. GRIMER is independent of quantification methods and directly analyzes contingency tables to create an interactive and offline report. Reports can be created in seconds and are accessible for nonspecialists, providing an intuitive set of charts to explore data distribution among observations and samples and its connections with external sources. Further, we compiled and used an extensive list of possible external contaminant taxa and common contaminants with 210 genera and 627 species reported in 22 published articles.
    
    Conclusion:
    GRIMER enables visual data exploration and analysis, supporting contamination detection in microbiome studies. The tool and data presented are open source and available at https://gitlab.com/dacs-hpi/grimer.},
     author = {Vitor C Piro and Bernhard Y Renard},
     journal = {GigaScience},
     month = {March},
     publisher = {Oxford University Press},
     title = {Contamination detection and microbiome exploration with GRIMER},
     url = {http://publications.imp.fu-berlin.de/2949/},
     volume = {12},
     year = {2023}
    }
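
The HIBF of Mehringer et al. (above) extends the Interleaved Bloom Filter (IBF). The hand-rolled sketch below is not SeqAn's implementation; it only illustrates the interleaving idea under simplified assumptions (a toy std::hash-derived hash family, std::vector<bool> storage): the bits of all bins are stored side by side, so each hash probe fetches the membership bits of every bin at once.

    // Conceptual interleaved Bloom filter (IBF): the bit vectors of all bins
    // are interleaved, so one probe per hash function reads the membership
    // bits of every bin side by side.
    #include <cstddef>
    #include <functional>
    #include <iostream>
    #include <string>
    #include <vector>

    struct InterleavedBloomFilter
    {
        std::size_t bins, m;        // m = number of positions per single filter
        std::vector<bool> bits;     // m * bins interleaved bits
        std::size_t hash_count = 3;

        InterleavedBloomFilter(std::size_t bins_, std::size_t m_)
            : bins(bins_), m(m_), bits(m_ * bins_, false) {}

        // Toy hash family derived from std::hash; real IBFs use better hashes.
        std::size_t slot(std::string const & kmer, std::size_t h) const
        { return (std::hash<std::string>{}(kmer) * (2 * h + 1)) % m; }

        void insert(std::string const & kmer, std::size_t bin)
        {
            for (std::size_t h = 0; h < hash_count; ++h)
                bits[slot(kmer, h) * bins + bin] = true;
        }

        // AND the rows of all hash functions; bin b may contain kmer iff result[b] is set.
        std::vector<bool> contains(std::string const & kmer) const
        {
            std::vector<bool> result(bins, true);
            for (std::size_t h = 0; h < hash_count; ++h)
            {
                std::size_t row = slot(kmer, h) * bins;
                for (std::size_t b = 0; b < bins; ++b)
                    result[b] = result[b] && bits[row + b];
            }
            return result;
        }
    };

    int main()
    {
        InterleavedBloomFilter ibf(4, 1024);  // 4 bins, 1024 positions each
        ibf.insert("ACGT", 2);
        for (bool hit : ibf.contains("ACGT"))
            std::cout << hit << ' ';          // bin 2 reports 1, the others most likely 0
        std::cout << '\n';
    }

Querying all (representative) k-mers of a read against such a filter and thresholding the per-bin counts yields the approximate membership queries benchmarked in the paper; the hierarchical variant arranges IBFs in a tree so that bins of very different sizes remain space-efficient.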

2022

  • F. Meyer, A. Fritz, Z.-L. Deng, D. Koslicki, A. Gurevich, G. Robertson, M. Alser, D. Antipov, F. Beghini, D. Bertrand, J. J. Brito, C.T. Brown, J. Buchmann, A. Buluç, B. Chen, R. Chikhi, P. T. Clausen, A. Cristian, P. W. Dabrowski, A. E. Darling, R. Egan, E. Eskin, E. Georganas, E. Goltsman, M. A. Gray, L. H. Hansen, S. Hofmeyr, P. Huang, L. Irber, H. Jia, T. S. Jørgensen, S. D. Kieser, T. Klemetsen, A. Kola, M. Kolmogorov, A. Korobeynikov, J. Kwan, N. LaPierre, C. Lemaitre, C. Li, A. Limasset, F. Malcher-Miranda, S. Mangul, V. R. Marcelino, C. Marchet, P. Marijon, D. Meleshko, D. R. Mende, A. Milanese, N. Nagarajan, J. Nissen, S. Nurk, L. Oliker, L. Paoli, P. Peterlongo, V. C. Piro, J. S. Porter, S. Rasmussen, E. R. Rees, K. Reinert, B. Renard, E. M. Robertsen, G. L. Rosen, H.-J. Ruscheweyh, V. Sarwal, N. Segata, E. Seiler, L. Shi, F. Sun, S. Sunagawa, S. J. Sørensen, A. Thomas, C. Tong, M. Trajkovski, J. Tremblay, G. Uritskiy, R. Vicedomini, Zi. Wang, Zhe. Wang, Zho. Wang, A. Warren, N. P. Willassen, K. Yelick, R. You, G. Zeller, Z. Zhao, S. Zhu, J. Zhu, R. Garrido-Oter, P. Gastmeier, S. Hacquard, S. Häußler, A. Khaledi, F. Maechler, F. Mesny, S. Radutoiu, P. Schulze-Lefert, N. Smit, T. Strowig, A. Bremges, A. Sczyrba, A. C. McHardy, “Critical Assessment of Metagenome Interpretation - the second round of challenges”, vol. 19, iss. 4, 2022-04-08.
    cite this publication
    @article{fu_mi_publications2605,
     abstract = {Evaluating metagenomic software is key for optimizing metagenome interpretation and the focus of the community-driven initiative for the Critical Assessment of Metagenome Interpretation (CAMI). In its second challenge, CAMI engaged the community to assess their methods on realistic and complex metagenomic datasets with long and short reads, created from {$\sim$}1,700 novel and known microbial genomes, as well as {$\sim$}600 novel plasmids and viruses. Altogether 5,002 results by 76 program versions were analyzed, representing a 22x increase in results.
    
    Substantial improvements were seen in metagenome assembly, some due to using long-read data. The presence of related strains still was challenging for assembly and genome binning, as was assembly quality for the latter. Taxon profilers demonstrated a marked maturation, with taxon profilers and binners excelling at higher bacterial taxonomic ranks, but underperforming for viruses and archaea. Assessment of clinical pathogen detection techniques revealed a need to improve reproducibility. Analysis of program runtimes and memory usage identified highly efficient programs, including some top performers with other metrics. The CAMI II results identify current challenges, but also guide researchers in selecting methods for specific analyses.
    
    Competing Interest Statement:
    
    A.E.D. co-founded Longas Technologies Pty Ltd, a company aimed at development of synthetic long-read sequencing technologies.},
     author = {F. Meyer and A. Fritz and Z.-L. Deng and D. Koslicki and A. Gurevich and G. Robertson and M. Alser and D. Antipov and F. Beghini and D. Bertrand and J. J. Brito and C.T. Brown and J. Buchmann and A. Bulu{\c c} and B. Chen and R. Chikhi and P. T. Clausen and A. Cristian and P. W. Dabrowski and A. E. Darling and R. Egan and E. Eskin and E. Georganas and E. Goltsman and M. A. Gray and L. H. Hansen and S. Hofmeyr and P. Huang and L. Irber and H. Jia and T. S. J{\o}rgensen and S. D. Kieser and T. Klemetsen and A. Kola and M. Kolmogorov and A. Korobeynikov and J. Kwan and N. LaPierre and C. Lemaitre and C. Li and A. Limasset and F. Malcher-Miranda and S. Mangul and V. R. Marcelino and C. Marchet and P. Marijon and D. Meleshko and D. R. Mende and A. Milanese and N. Nagarajan and J. Nissen and S. Nurk and L. Oliker and L. Paoli and P. Peterlongo and V. C. Piro and J. S. Porter and S. Rasmussen and E. R. Rees and K. Reinert and B. Renard and E. M. Robertsen and G. L. Rosen and H.-J. Ruscheweyh and V. Sarwal and N. Segata and E. Seiler and L. Shi and F. Sun and S. Sunagawa and S. J. S{\o}rensen and A. Thomas and C. Tong and M. Trajkovski and J. Tremblay and G. Uritskiy and R. Vicedomini and Zi. Wang and Zhe. Wang and Zho. Wang and A. Warren and N. P. Willassen and K. Yelick and R. You and G. Zeller and Z. Zhao and S. Zhu and J. Zhu and R. Garrido-Oter and P. Gastmeier and S. Hacquard and S. H{\"a}u{\ss}ler and A. Khaledi and F. Maechler and F. Mesny and S. Radutoiu and P. Schulze-Lefert and N. Smit and T. Strowig and A. Bremges and A. Sczyrba and A. C. McHardy},
     journal = {Nature Methods},
     month = {April},
     number = {4},
     pages = {429--440},
     publisher = {Nature Publishing Group},
     title = {Critical Assessment of Metagenome Interpretation - the second round of challenges},
     url = {http://publications.imp.fu-berlin.de/2605/},
     volume = {19},
     year = {2022}
    }
  • Jörg Winkler, Gianvito Urgese, Elisa Ficarra, Knut Reinert, “LaRA 2: parallel and vectorized program for sequence–structure alignment of RNA sequences”, vol. 23, iss. 1, 2022-01-06.
    cite this publication
    @article{fu_mi_publications2695,
     abstract = {Background
    
    The function of non-coding RNA sequences is largely determined by their spatial conformation, namely the secondary structure of the molecule, formed by Watson-Crick interactions between nucleotides. Hence, modern RNA alignment algorithms routinely take structural information into account. In order to discover yet unknown RNA families and infer their possible functions, the structural alignment of RNAs is an essential task. This task demands a lot of computational resources, especially for aligning many long sequences, and it therefore requires efficient algorithms that utilize modern hardware when available. A subset of the secondary structures contains overlapping interactions (called pseudoknots), which add additional complexity to the problem and are often ignored in available software.
    
    Results
    
    We present the SeqAn-based software LaRA 2 that is significantly faster than comparable software for accurate pairwise and multiple alignments of structured RNA sequences. In contrast to other programs our approach can handle arbitrary pseudoknots. As an improved re-implementation of the LaRA tool for structural alignments, LaRA 2 uses multi-threading and vectorization for parallel execution and a new heuristic for computing a lower boundary of the solution. Our algorithmic improvements yield a program that is up to 130 times faster than the previous version.
    
    Conclusions
    
    With LaRA 2 we provide a tool to analyse large sets of RNA secondary structures in relatively short time, based on structural alignment. The produced alignments can be used to derive structural motifs for the search in genomic databases.},
     author = {J{\"o}rg Winkler and Gianvito Urgese and Elisa Ficarra and Knut Reinert},
     journal = {BMC Bioinformatics},
     month = {January},
     number = {1},
     publisher = {Springer Nature},
     title = {LaRA 2: parallel and vectorized program for sequence{--}structure alignment of RNA sequences},
     url = {http://publications.imp.fu-berlin.de/2695/},
     volume = {23},
     year = {2022}
    }
  • Marius Knaust, Enrico Seiler, Knut Reinert, Thomas Steinke, “Co-Design for Energy Efficient and Fast Genomic Search: Interleaved Bloom Filter on FPGA”, 2022-02.
    cite this publication
    @inproceedings{fu_mi_publications2809,
     abstract = {Next-Generation Sequencing technologies generate a vast and exponentially increasing amount of sequence data. The Interleaved Bloom Filter (IBF) is a novel indexing data structure which is state-of-the-art for distributing approximate queries with an in-memory data structure. With it, a main task of sequence analysis pipelines, (approximately) searching large reference data sets for sequencing reads or short sequence patterns like genes, can be significantly accelerated. To meet performance and energy-efficiency requirements, we chose a co-design approach of the IBF data structure on the FPGA platform. Further, our OpenCL-based implementation allows a seamless integration into the widely used SeqAn C++ library for biological sequence analysis. Our algorithmic design and optimization strategy takes advantage of FPGA-specific features like shift registers and the parallelization potential of many bitwise operations. We designed a well-chosen schema to partition data across the different memory domains on the FPGA platform using the Shared Virtual Memory concept. We can demonstrate significant improvements in energy efficiency of up to 19 times and in performance of up to 5.6 times, respectively, compared to a well-tuned, multithreaded CPU reference.},
     author = {Marius Knaust and Enrico Seiler and Knut Reinert and Thomas Steinke},
     booktitle = {Proceedings of the 2022 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},
     month = {February},
     note = {conference website: https://www.isfpga.org/},
     pages = {180--189},
     title = {Co-Design for Energy Efficient and Fast Genomic Search: Interleaved Bloom Filter on FPGA},
     url = {http://publications.imp.fu-berlin.de/2809/},
     year = {2022}
    }
  • Jafar Gholami, Mohamad Reza Kamankesh, Somayeh Mohammadi, Elahe Hosseinkhani, Somayeh Abdi, “Powerful enhanced Jaya algorithm for efficiently optimizing numerical and engineering problems”, 2022-03-14.
    cite this publication
    @article{fu_mi_publications2810,
     abstract = {Over the last decade, the size and complexity of real-world problems have grown dramatically, necessitating more effective tools. Nature-inspired metaheuristic algorithms have proven to be a promising tool for solving such problems due to their performance in a variety of fields. The JAYA algorithm is a novel population-based algorithm which has been able to present reliable results. This is because it does not need any parameters to be set other than the population size and the maximum number of iterations. Despite this positive feedback, the algorithm needs modification to achieve more efficiency. This paper aims to amend the original version of Jaya to present a high-efficiency version named Powerful Enhanced Jaya (PEJAYA). In other words, the methodology of updating position in Jaya is modified to enhance the convergence and search capabilities. This approach is assessed on 20 well-known benchmark functions, feature selection tasks, and statistical tests. The output results of the proposed optimization algorithm are then evaluated by comparing it with other recent algorithms including the crow search algorithm (CSA), the standard version of JAYA, particle swarm optimization (PSO), the dragonfly algorithm (DA), the grasshopper optimization algorithm (GOA), moth-flame optimization (MFO) and the sine-cosine algorithm (SCA). Solving a real-world problem provides a further check of the efficiency of this approach against other published works. Prompt escape from local minima, superior convergence, and stability demonstrate that the suggested approach is a very powerful instrument that may be employed in a variety of optimization situations.},
     author = {Jafar Gholami and Mohamad Reza Kamankesh and Somayeh Mohammadi and Elahe Hosseinkhani and Somayeh Abdi},
     journal = {Soft Computing},
     month = {March},
     title = {Powerful enhanced Jaya algorithm for efficiently optimizing numerical and engineering problems},
     url = {http://publications.imp.fu-berlin.de/2810/},
     year = {2022}
    }
  • Mitra Darvish, Enrico Seiler, Svenja Mehringer, René Rahn, Knut Reinert, Yann Ponty, “Needle: a fast and space-efficient prefilter for estimating the quantification of very large collections of expression experiments”, vol. 38, iss. 17, 2022-07-08.
    cite this publication
    @article{fu_mi_publications2845,
     abstract = {Motivation
    
    The ever-growing size of sequencing data is a major bottleneck in bioinformatics as the advances of hardware development cannot keep up with the data growth. Therefore, an enormous amount of data is collected but rarely ever reused, because it is nearly impossible to find meaningful experiments in the stream of raw data.
    
    Results
    
    As a solution, we propose Needle, a fast and space-efficient index which can be built for thousands of experiments in {\ensuremath{<}}2 h and can estimate the quantification of a transcript in these experiments in seconds, thereby outperforming its competitors. The basic idea of the Needle index is to create multiple interleaved Bloom filters that each store a set of representative k-mers depending on their multiplicity in the raw data. This is then used to quantify the query.

    Availability and implementation
    
    https://github.com/seqan/needle.},
     author = {Mitra Darvish and Enrico Seiler and Svenja Mehringer and Ren{\'e} Rahn and Knut Reinert and Yann Ponty},
     journal = {Bioinformatics},
     month = {July},
     number = {17},
     pages = {4100--4108},
     title = {Needle: a fast and space-efficient prefilter for estimating the quantification of very large collections of expression experiments},
     url = {http://publications.imp.fu-berlin.de/2845/},
     volume = {38},
     year = {2022}
    }
  • Thomas Krannich, “Contributions to the detection of non-reference sequences in population-scale NGS data”, 2022-06-02.
    cite this publication
    @phdthesis{fu_mi_publications2854,
     abstract = {Non-reference sequence (NRS) variants are a less frequently investigated class of genomic structural variants (SV). Here, DNA sequences are found within an individual that are novel with respect to a given reference. NRS occur predominantly due to the fact that a linear reference genome lacks biological diversity and ancestral sequence if it was primarily derived from a single or few individuals. Therefore, newly sequenced individuals can yield genomic sequences which are absent from a reference genome. With the increasing throughput of sequencing technologies, SV detection has become possible across tens of thousands of individuals. When using short-read data, the detection of NRS variants inevitably involves a de novo assembly which is a complex computational problem and requires high-quality sequence data at high coverage. Previous studies have demonstrated how sequence data of multiple genomes can be combined for the reliable detection of NRS variants. However, the algorithms proposed in these studies have a limited capability to process large sets of genomes. This thesis introduces novel contributions for the discovery of NRS variants in many genomes, which scale to considerably larger numbers of genomes than previous methods. A practical software tool, PopIns2, that was developed to apply the presented methods is elucidated in greater detail. The highlight among the new contributions is a procedure to merge contig assemblies of unaligned reads from many individuals into a single set of NRS by heuristically generating a weighted minimum path cover for a colored de Bruijn graph. Tests on simulated data show that PopIns2 ranks among the best approaches in terms of quality and reliability and that its approach yields the best precision for a growing number of genomes processed. Results on the Polaris Diversity Cohort and a set of 1000 Icelandic human genomes demonstrate unmatched scalability for the application on population-scale datasets.},
     author = {Thomas Krannich},
     month = {June},
     school = {Freie Universit{\"a}t Berlin},
     title = {Contributions to the detection of non-reference sequences in population-scale NGS data},
     url = {http://publications.imp.fu-berlin.de/2854/},
     year = {2022}
    }
  • Sebastian Niehus, “Multi-Sample Approaches and Applications for Structural Variant Detection”, 2022-07-28.
    cite this publication
    @phdthesis{fu_mi_publications2855,
     abstract = {In recent years, advances in the field of sequencing technologies have enabled the field of population-scale sequencing studies. These studies aim to sequence and analyze a large set of individuals from one or multiple populations, with the aim of gaining insight into underlying genetic structure, similarities and differences. Collections of genetic variation and possible connections to various disease are some of the products of this area of research. The potential of population studies is widely considered to be huge and many more endeavors of this kind are expected in the near future. This opportunity comes with a big challenge because many computational tools that are used for the analysis of sequencing data were not designed for cohorts of this size and may suffer from limited scalability. It is therefore vital that the computational tools required for the analysis of population-scale data keep up with the quickly growing amounts of data.
    
    This thesis contributes to the field of population-scale genetics in the development and application of a novel approach for structural variant detection. It has explicitly been designed with the large amounts of population-scale sequencing data in mind. The presented approach is capable of analyzing tens of thousands of whole-genome short-read sequencing samples jointly. This joint analysis is driven by a tailored joint likelihood ratio model that integrates information from many genomes. The efficient approach not only saves computational resources but also allows combining the data across all samples to make sensitive and specific predictions about the presence and genotypes of structural variation present within the analyzed population. This thesis demonstrates that this approach and the computational tool PopDel that implements it compare favorably to current state-of-the-art structural variant callers that have been used in previous population-scale studies. Extensive benchmarks on simulated and real world sequencing data are provided to show the performance of the presented approach. Further, a first finding of medical relevance that directly stems from the application of PopDel on the genomes of almost 50,000 Icelanders is presented.
    
    This thesis therefore provides a novel tool and new ideas to further push the boundaries of the analysis of massive amounts of next generation sequencing data and to deepen our understanding of structural variation and their implications for human health.},
     author = {Sebastian Niehus},
     month = {July},
     school = {Freie Universit{\"a}t Berlin},
     title = {Multi-Sample Approaches and Applications for Structural Variant Detection},
     url = {http://publications.imp.fu-berlin.de/2855/},
     year = {2022}
    }
  • Yannek Nowatzky, Philipp Benner, Knut Reinert, Thilo Muth, “Mistle: bringing spectral library predictions to metaproteomics with an efficient search index”, 2022-09-12.
    cite this publication
    @article{fu_mi_publications2946,
     abstract = {Motivation: 
    Deep learning has moved to the forefront of tandem mass spectrometry-driven proteomics and authentic prediction for peptide fragmentation is more feasible than ever. Still, at this point spectral prediction is mainly used to validate database search results or for confined search spaces. Fully predicted spectral libraries have not yet been efficiently adapted to large search space problems that often occur in metaproteomics or proteogenomics.
    
    Results: 
    In this study, we showcase a workflow that uses Prosit for spectral library predictions on two common metaproteomes and implement an indexing and search algorithm, Mistle, to efficiently identify experimental mass spectra within the library. Hence, the workflow emulates a classic protein sequence database search with protein digestion but builds a searchable index from spectral predictions as an in-between step. We compare Mistle to popular search engines, both on a spectral and database search level, and provide evidence that this approach is more accurate than a database search using MSFragger. Mistle outperforms other spectral library search engines in terms of run time and proves to be extremely memory efficient with an 8- to 22-fold decrease in RAM usage. This makes Mistle universally applicable to large search spaces, e.g., covering comprehensive sequence databases of diverse microbiomes.
    
    Availability: 
    Mistle is freely available on GitHub at https://github.com/BAMeScience/Mistle.},
     author = {Yannek Nowatzky and Philipp Benner and Knut Reinert and Thilo Muth},
     journal = {bioRxiv},
     month = {September},
     title = {Mistle: bringing spectral library predictions to metaproteomics with an efficient search index},
     url = {http://publications.imp.fu-berlin.de/2946/},
     year = {2022}
    }
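
Needle (Darvish et al., above), like Raptor, stores representative k-mers rather than all k-mers. Below is a naive sketch of (w,k)-minimizer selection; it assumes lexicographic order for simplicity, whereas production tools use a hashed k-mer order and an O(n) sliding-window minimum:

    // Collect (w,k)-minimizers: the smallest k-mer of every window of w
    // consecutive k-mers. Similar regions then share representative k-mers.
    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <set>
    #include <string>

    std::set<std::string> minimizers(std::string const & seq, std::size_t k, std::size_t w)
    {
        std::set<std::string> result;
        if (seq.size() < k + w - 1)
            return result;                          // no full window fits
        for (std::size_t i = 0; i + k + w - 1 <= seq.size(); ++i)
        {
            std::string smallest = seq.substr(i, k);
            for (std::size_t j = 1; j < w; ++j)     // scan the w k-mers of this window
                smallest = std::min(smallest, seq.substr(i + j, k));
            result.insert(smallest);                // std::set deduplicates
        }
        return result;
    }

    int main()
    {
        for (auto const & m : minimizers("ACGTTGCATGACG", 4, 3))
            std::cout << m << '\n';
    }

Because adjacent windows usually share their minimum, the selected set is much smaller than the full k-mer content, which is what makes the IBF-based indexes of Needle and Raptor compact.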

2021

  • Silver A Wolf, Lennard Epping, Sandro Andreotti, Knut Reinert, Torsten Semmler, Inanc Birol, “SCORE: Smart Consensus Of RNA Expression—a consensus tool for detecting differentially expressed genes in bacteria”, vol. 37, iss. 3, 2021-02-01.
    cite this publication
    @article{fu_mi_publications2513,
     abstract = {RNA-sequencing (RNA-Seq) is the current method of choice for studying bacterial transcriptomes. To date, many computational pipelines have been developed to predict differentially expressed genes from RNA-Seq data, but no gold-standard has been widely accepted. We present the Snakemake-based tool Smart Consensus Of RNA Expression (SCORE) which uses a consensus approach founded on a selection of well-established tools for differential gene expression analysis. This allows SCORE to increase the overall prediction accuracy and to merge varying results into a single, human-readable output. SCORE performs all steps for the analysis of bacterial RNA-Seq data, from read preprocessing to the overrepresentation analysis of significantly associated ontologies. Development of consensus approaches like SCORE will help to streamline future RNA-Seq workflows and will fundamentally contribute to the creation of new gold-standards for the analysis of these types of data.
    Availability and implementation: 
    https://github.com/SiWolf/SCORE.},
     author = {Silver A Wolf and Lennard Epping and Sandro Andreotti and Knut Reinert and Torsten Semmler and Inanc Birol},
     journal = {Bioinformatics},
     month = {February},
     number = {3},
     pages = {426--428},
     publisher = {Oxford University Press},
     title = {SCORE: Smart Consensus Of RNA Expression{--}a consensus tool for detecting differentially expressed genes in bacteria},
     url = {http://publications.imp.fu-berlin.de/2513/},
     volume = {37},
     year = {2021}
    }
  • Enrico Seiler, Svenja Mehringer, Mitra Darvish, Etienne Turc, Knut Reinert, “Raptor: A fast and space-efficient pre-filter for querying very large collections of nucleotide sequences”, vol. 24, iss. 7, 2021-07-23.
    cite this publication
    @article{fu_mi_publications2519,
     abstract = {We present Raptor, a tool for approximately searching many queries in large collections of nucleotide sequences. In comparison with similar tools like Mantis and COBS, Raptor is 12-144 times faster and uses up to 30 times less memory. Raptor uses winnowing minimizers to define a set of representative k-mers, an extension of the Interleaved Bloom Filters (IBF) as a set membership data structure, and probabilistic thresholding for minimizers. Our approach allows compression and a partitioning of the IBF to enable the effective use of secondary memory.
    Competing Interest Statement: The authors have declared no competing interest.},
     author = {Enrico Seiler and Svenja Mehringer and Mitra Darvish and Etienne Turc and Knut Reinert},
     journal = {iScience},
     month = {July},
     number = {7},
     pages = {102782},
     publisher = {Elsevier},
     title = {Raptor: A fast and space-efficient pre-filter for querying very large collections of nucleotide sequences},
     url = {http://publications.imp.fu-berlin.de/2519/},
     volume = {24},
     year = {2021}
    }
  • Marie Hoffmann, Michael T. Monaghan, Knut Reinert, “PriSeT: Efficient De Novo Primer Discovery”, 2021-08-01.
    cite this publication
    @inproceedings{fu_mi_publications2520,
     abstract = {Motivation: 
    DNA metabarcoding is a commonly applied technique used to infer the species composition of environmental samples. These samples can comprise hundreds of organisms that can be closely or very distantly related in the taxonomic tree of life. DNA metabarcoding combines polymerase chain reaction (PCR) and next-generation sequencing (NGS), whereby a short, homologous sequence of DNA is amplified and sequenced from all members of the community. Sequences are then taxonomically identified based on their match to a reference database. Ideally, each species of interest would have a unique DNA barcode. This short, variable sequence needs to be flanked by relatively conserved regions that can be used as primer binding sites. Appropriate PCR primer pairs would match to a broad evolutionary range of taxa, such that we only need a few to achieve high taxonomic coverage. At the same time however, the DNA barcodes between primer pairs should be different to allow us to distinguish between species to improve resolution. This poses an interesting optimization problem. More specifically: Given a set of references {$\mathcal{R}$} = \{R1, R2, {$\ldots$}, Rm\}, the problem is to find a primer set P balancing both: high taxonomic coverage and high resolution. This goal can be captured by filtering for frequent primers and ranking by coverage or variation, i.e. the number of unique barcodes. Here we present the software PriSeT, an offline primer discovery tool that is capable of processing large libraries and is robust against mislabeled or low quality references. It tackles the computationally expensive steps with linear runtime filters and efficient encodings.
    
    Results: 
    We first evaluated PriSeT on references (mostly 18S rRNA genes) from 19 clades covering eukaryotic organisms that are typical for freshwater plankton samples. PriSeT recovered several published primer sets as well as additional, more chemically suitable primer sets. For these new sets, we compared frequency, taxon coverage, and amplicon variation with published primer sets. For 11 clades we found de novo primer pairs that cover more taxa than the published ones, and for six clades de novo primers resulted in greater sequence (i.e., DNA barcode) variation. We also applied PriSeT to 19 SARS-CoV-2 genomes and computed 114 new primer pairs with the additional constraint that the sequences have no co-occurrences in other taxa. These primer sets would be suitable for empirical testing.
    
    Availability: https://github.com/mariehoffmann/PriSeT},
     author = {Marie Hoffmann and Michael T. Monaghan and Knut Reinert},
     booktitle = {Proceedings of the 12th ACM Conference on Bioinformatics, Computational Biology, and Health Informatics},
     month = {August},
     pages = {1--12},
     publisher = {ACM},
     title = {PriSeT: Efficient De Novo Primer Discovery},
     url = {http://publications.imp.fu-berlin.de/2520/},
     year = {2021}
    }
  • Sandeep Keshavan, Fernando Torres Andón, Audrey Gallud, Wei Chen, Knut Reinert, Lang Tran, Bengt Fadeel, “Profiling of Sub-Lethal in Vitro Effects of Multi-Walled Carbon Nanotubes Reveals Changes in Chemokines and Chemokine Receptors”, vol. 11, iss. 4, 2021-03-30.
    cite this publication
    @article{fu_mi_publications2575,
     abstract = {Engineered nanomaterials are potentially very useful for a variety of applications, but studies are needed to ascertain whether these materials pose a risk to human health. Here, we studied three benchmark nanomaterials (Ag nanoparticles, TiO2 nanoparticles, and multi-walled carbon nanotubes, MWCNTs) procured from the nanomaterial repository at the Joint Research Centre of the European Commission. Having established a sub-lethal concentration of these materials using two human cell lines representative of the immune system and the lungs, respectively, we performed RNA sequencing of the macrophage-like cell line after exposure for 6, 12, and 24 h. Downstream analysis of the transcriptomics data revealed significant effects on chemokine signaling pathways. CCR2 was identified as the most significantly upregulated gene in MWCNT-exposed cells. Using multiplex assays to evaluate cytokine and chemokine secretion, we could show significant effects of MWCNTs on several chemokines, including CCL2, a ligand of CCR2. The results demonstrate the importance of evaluating sub-lethal concentrations of nanomaterials in relevant target cells.},
     author = {Sandeep Keshavan and Fernando Torres And{\'o}n and Audrey Gallud and Wei Chen and Knut Reinert and Lang Tran and Bengt Fadeel},
     journal = {Nanomaterials},
     keywords = {multi-walled carbon nanotubes; nanoparticles; chemokines; macrophages; transcriptomics},
     month = {March},
     number = {4},
     pages = {883},
     publisher = {MDPI},
     title = {Profiling of Sub-Lethal in Vitro Effects of Multi-Walled Carbon Nanotubes Reveals Changes in Chemokines and Chemokine Receptors},
     url = {http://publications.imp.fu-berlin.de/2575/},
     volume = {11},
     year = {2021}
    }
  • Kerstin Neubert, Eric Zuchantke, Robert Maximilian Leidenfrost, Röbbe Wünschiers, Josephine Grützke, Burkhard Malorny, Holger Brendebach, Sascha Al Dahouk, Timo Homeier, Helmut Hotzel, Knut Reinert, Herbert Tomaso, Anne Busch, “Testing assembly strategies of Francisella tularensis genomes to infer an evolutionary conservation analysis of genomic structures”, vol. 22, iss. 1, 2021-11-14.
    cite this publication
    @article{fu_mi_publications2696,
     abstract = {Background
    
    We benchmarked sequencing technology and assembly strategies for short-read, long-read, and hybrid assemblers with respect to correctness, contiguity, and completeness of assemblies in genomes of Francisella tularensis. Benchmarking allowed in-depth analyses of genomic structures of the Francisella pathogenicity islands and insertion sequences. Five major high-throughput sequencing technologies were applied, including next-generation "short-read" and third-generation "long-read" sequencing methods.
    
    Results
    
    We focused on short-read assemblers, hybrid assemblers, and analysis of the genomic structure with particular emphasis on insertion sequences and the Francisella pathogenicity island. Of eight short-read assembly methods, the A5-miseq pipeline performed best for MiSeq data, Mira for Ion Torrent data, and ABySS for HiSeq data. Two approaches were applied to benchmark long-read and hybrid assembly strategies: long-read-first assembly followed by correction with short reads (Canu/Pilon, Flye/Pilon) and short-read-first assembly along with scaffolding based on long reads (Unicycler, SPAdes). Hybrid assembly can resolve large repetitive regions best with a "long-read first" approach.
    
    Conclusions
    
    Genomic structures of the Francisella pathogenicity islands frequently showed misassembly. Insertion sequences (IS) could be used to perform an evolutionary conservation analysis. A phylogenetic structure of insertion sequences and the evolution within the clades elucidated the clade structure of the highly conservative F. tularensis.},
     author = {Kerstin Neubert and Eric Zuchantke and Robert Maximilian Leidenfrost and R{\"o}bbe W{\"u}nschiers and Josephine Gr{\"u}tzke and Burkhard Malorny and Holger Brendebach and Sascha Al Dahouk and Timo Homeier and Helmut Hotzel and Knut Reinert and Herbert Tomaso and Anne Busch},
     journal = {BMC Genomics},
     month = {November},
     number = {1},
     pages = {822},
     publisher = {Springer Nature},
     title = {Testing assembly strategies of Francisella tularensis genomes to infer an evolutionary conservation analysis of genomic structures},
     url = {http://publications.imp.fu-berlin.de/2696/},
     volume = {22},
     year = {2021}
    }
  • Ulf Leser, Marcus Hilbrich, Claudia Draxl, Peter Eisert, Lars Grunske, Patrick Hostert, Dagmar Kainmüller, Odej Kao, Birte Kehr, Timo Kehrer, Christoph Koch, Volker Markl, Henning Meyerhenke, Tilmann Rabl, Alexander Reinefeld, Knut Reinert, Kerstin Ritter, Björn Scheuermann, Florian Schintke, Nicole Schweikardt, Matthias Weidlich, “The Collaborative Research Center FONDA”, vol. 21, iss. 3, 2021-11-12.
    cite this publication
    @article{fu_mi_publications2697,
     abstract = {Today's scientific data analysis very often requires complex Data Analysis Workflows (DAWs) executed over distributed computational infrastructures, e.g., clusters. Much research effort is devoted to the tuning and performance optimization of specific workflows for specific clusters. However, an arguably even more important problem for accelerating research is the reduction of development, adaptation, and maintenance times of DAWs. We describe the design and setup of the Collaborative Research Center (CRC) 1404 "FONDA -- Foundations of Workflows for Large-Scale Scientific Data Analysis", in which roughly 50 researchers jointly investigate new technologies, algorithms, and models to increase the portability, adaptability, and dependability of DAWs executed over distributed infrastructures. We describe the motivation behind our project, explain its underlying core concepts, introduce FONDA's internal structure, and sketch our vision for the future of workflow-based scientific data analysis. We also describe some lessons learned during the "making of" a CRC in Computer Science with strong interdisciplinary components, with the aim to foster similar endeavors.},
     author = {Ulf Leser and Marcus Hilbrich and Claudia Draxl and Peter Eisert and Lars Grunske and Patrick Hostert and Dagmar Kainm{\"u}ller and Odej Kao and Birte Kehr and Timo Kehrer and Christoph Koch and Volker Markl and Henning Meyerhenke and Tilmann Rabl and Alexander Reinefeld and Knut Reinert and Kerstin Ritter and Bj{\"o}rn Scheuermann and Florian Schintke and Nicole Schweikardt and Matthias Weidlich},
     journal = {Datenbank-Spektrum},
     month = {November},
     number = {3},
     pages = {255--260},
     publisher = {Springer Nature},
     title = {The Collaborative Research Center FONDA},
     url = {http://publications.imp.fu-berlin.de/2697/},
     volume = {21},
     year = {2021}
    }
  • Sara Hetzel, Pay Giesselmann, Knut Reinert, Alexander Meissner, Helene Kretzmer, Can Alkan, “RLM: fast and simplified extraction of read-level methylation metrics from bisulfite sequencing data”, vol. 37, iss. 21, 2021-11-01.
    cite this publication
    @article{fu_mi_publications2698,
     abstract = {Bisulfite sequencing data provide value beyond the straightforward methylation assessment by analyzing single-read patterns. Over the past years, various metrics have been established to explore this layer of information. However, limited compatibility with alignment tools, reference genomes or the measurements they provide present a bottleneck for most groups to routinely perform read-level analysis. To address this, we developed RLM, a fast and scalable tool for the computation of several frequently used read-level methylation statistics. RLM supports standard alignment tools, works independently of the reference genome and handles most sequencing experiment designs. RLM can process large input files with a billion reads in just a few hours on common workstations.
    
    Availability and implementation: 
    https://github.com/sarahet/RLM},
     author = {Sara Hetzel and Pay Giesselmann and Knut Reinert and Alexander Meissner and Helene Kretzmer and Can Alkan},
     journal = {Bioinformatics},
     month = {November},
     number = {21},
     pages = {3934--3935},
     publisher = {Oxford University Press},
     title = {RLM: fast and simplified extraction of read-level methylation metrics from bisulfite sequencing data},
     url = {http://publications.imp.fu-berlin.de/2698/},
     volume = {37},
     year = {2021}
    }
  • Hannes Peer Hauswedell, “SeqAn3 – Sequence Analysis and Modern C++”, 2021-05-28.
    cite this publication
    @phdthesis{fu_mi_publications2850,
     abstract = {This thesis introduces SeqAn3, a new software library built with Modern C++ to solve problems from the domain of sequence analysis in bioinformatics. It discusses previous versions of the library in detail and explains the importance of highly performing programming languages like C++. Complexity in the design of the library and of the programming language itself is identified as the major obstacle to user satisfaction, widespread adoption and long-term viability of the project. Therefore, based on very fundamental changes in the C++ programming language, a new library design is formulated and implemented. Its impact is showcased by porting the local aligner called Lambda from SeqAn2 to SeqAn3. Both the library and the application are highly relevant in practice and prove that simpler and more compact solutions are possible. This thesis documents the process of creating said software, contributing vital information to the fields of research software engineering, library design and to a certain degree also applied programming language research. As one of the first larger projects to be designed fully around C++20 features, it has instructive value beyond bioinformatics.},
     author = {Hannes Peer Hauswedell},
     month = {May},
     school = {Freie Universit{\"a}t Berlin},
     title = {SeqAn3 {--} Sequence Analysis and Modern C++},
     url = {http://publications.imp.fu-berlin.de/2850/},
     year = {2021}
    }
  • Pay Giesselmann, “Genome Analysis Methods using Long Read Nanopore Sequencing”, 2021-12-15.
    cite this publication
    @phdthesis{fu_mi_publications2852,
     abstract = {Third-generation long-read technologies denote the latest progression in high throughput DNA and RNA sequence analysis. Complementing the widespread second-generation short-read platforms, long-read sequencing adds unique application opportunities by generating previously unattainable read lengths. Despite the remaining higher error rate compared to short reads, single-molecule real-time sequencing (SMRT) and nanopore sequencing advanced to be state-of-the-art for de-novo genome assemblies and identification of structural variants. Continuous throughput and accuracy improvements lead to development of novel methods and applications at a fast pace. We identify major application fields and key bioinformatic software for long-read sequencing data analysis by employing a data driven literature research. The integration of citations and keywords into a literature graph provides a scaling approach to analyze an exponentially growing number of third-generation sequencing related publications. Even though sparking the development of countless bioinformatics software, the streamlined nanopore data processing into standardized formats is still lacking. As an enabling step for its successful application, we developed Nanopype, a modular and scalable pipeline. Our approach facilitates the basic steps of basecalling, alignment, methylation- and structural variant detection with exchangeable tools in each module. Optimized for the usage on high performance compute clusters, we propose a raw data management, capable of handling multiple sequencing devices placed locally and remotely. Strict version control of integrated tools and deployment as containerized software, ensure reproducibility across projects and laboratories. Finally, we analyze disease associated repeat regions utilizing targeted nanopore sequencing and the Nanopype processing infrastructure. The expansion of unstable genomic short tandem repeats (STRs) is of particular interest as it causes more than 30 Mendelian human disorders. Long stretches of repetitive sequence render these regions inaccessible for short-read sequencing by synthesis. Furthermore, finding current nanopore basecalling algorithms insufficient to resolve the repeat length, we developed STRique, a raw nanopore signal based repeat detection and quantification software. We demonstrate the precise analysis of repeat lengths on patient-derived samples containing C9orf72 and FMR1 repeat expansions. The additional integration of repeat- and nearby promoter-methylation levels reveal a repeat length depending gain, suggesting an epigenetic response to the expansion. Taken together, this work contributes to further increase the usability and provides novel insights based on third-generation nanopore sequencing.},
     author = {Pay Giesselmann},
     month = {December},
     school = {Freie Universit{\"a}t Berlin},
     title = {Genome Analysis Methods using Long Read Nanopore Sequencing},
     url = {http://publications.imp.fu-berlin.de/2852/},
     year = {2021}
    }
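
Hauswedell's thesis (above) describes the C++20-based redesign of the library. The following small sketch conveys the flavor of the resulting API; it assumes a SeqAn 3.1-style installation and header layout:

    // SeqAn3 style: typed alphabets plus lazily composed C++20 range views.
    #include <ranges>
    #include <seqan3/alphabet/nucleotide/dna5.hpp>
    #include <seqan3/alphabet/views/complement.hpp>
    #include <seqan3/core/debug_stream.hpp>

    int main()
    {
        using namespace seqan3::literals;
        seqan3::dna5_vector seq = "ACGTAGGA"_dna5;  // typed alphabet, not raw chars
        // Reverse complement as a view: nothing is copied or computed eagerly.
        auto rc = seq | std::views::reverse | seqan3::views::complement;
        seqan3::debug_stream << rc << '\n';         // prints TCCTACGT
    }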

2020

  • Julianus Pfeuffer, Timo Sachsenberg, Tjeerd M. H. Dijkstra, Oliver Serang, Knut Reinert, Oliver Kohlbacher, “EPIFANY: A Method for Efficient High-Confidence Protein Inference”, vol. 19, iss. 3, 2020-01-24.
    cite this publication
    @article{fu_mi_publications2429,
     abstract = {Accurate protein inference in the presence of shared peptides is still one of the key problems in bottom-up proteomics. Most protein inference tools employing simple heuristic inference strategies are efficient but exhibit reduced accuracy. More advanced probabilistic methods often exhibit better inference quality but tend to be too slow for large data sets. Here, we present a novel protein inference method, EPIFANY, combining a loopy belief propagation algorithm with convolution trees for efficient processing of Bayesian networks. We demonstrate that EPIFANY combines the reliable protein inference of Bayesian methods with significantly shorter runtimes. On the 2016 iPRG protein inference benchmark data, EPIFANY is the only tested method that finds all true-positive proteins at a 5\% protein false discovery rate (FDR) without strict prefiltering on the peptide-spectrum match (PSM) level, yielding an increase in identification performance (+10\% in the number of true positives and +14\% in partial AUC) compared to previous approaches. Even very large data sets with hundreds of thousands of spectra (which are intractable with other Bayesian and some non-Bayesian tools) can be processed with EPIFANY within minutes. The increased inference quality including shared peptides results in better protein inference results and thus increased robustness of the biological hypotheses generated. EPIFANY is available as open-source software for all major platforms at https://OpenMS.de/epifany.},
     author = {Julianus Pfeuffer and Timo Sachsenberg and Tjeerd M. H. Dijkstra and Oliver Serang and Knut Reinert and Oliver Kohlbacher},
     journal = {Journal of Proteome Research},
     keywords = {bottom-up proteomics, protein inference, Bayesian networks, convolution trees, loopy belief propagation, iPRG2016},
     month = {January},
     number = {3},
     pages = {1060--1072},
     publisher = {ACS Publications},
     title = {EPIFANY: A Method for Efficient High-Confidence Protein Inference},
     url = {http://publications.imp.fu-berlin.de/2429/},
     volume = {19},
     year = {2020}
    }
  • Christopher Schiefer, Marc Bux, Joergen Brandt, Clemens Messerschmidt, Knut Reinert, Dieter Beule, Ulf Leser, “Portability of Scientific Workflows in NGS Data Analysis: A Case Study”, 2020-06-04.
    cite this publication
    @techreport{fu_mi_publications2510,
     abstract = {The analysis of next-generation sequencing (NGS) data requires complex computational workflows consisting of dozens of autonomously developed yet interdependent processing steps. Whenever large amounts of data need to be processed, these workflows must be executed on parallel and/or distributed systems to ensure reasonable runtime. Porting a workflow developed for a particular system on a particular hardware infrastructure to another system or to another infrastructure is non-trivial, which poses a major impediment to the scientific necessities of workflow reproducibility and workflow reusability. In this work, we describe our efforts to port a state-of-the-art workflow for the detection of specific variants in whole-exome sequencing of mice. The workflow originally was developed in the scientific workflow system snakemake for execution on a high-performance cluster controlled by Sun Grid Engine. In the project, we ported it to the scientific workflow system SaasFee that can execute workflows on (multi-core) stand-alone servers or on clusters of arbitrary sizes using Hadoop. The purpose of this port was to enable owners of low-cost hardware infrastructures, for which Hadoop was made, to also use the workflow. Although both the source and the target system are called scientific workflow systems, they differ in numerous aspects, ranging from the workflow languages to the scheduling mechanisms and the file access interfaces. These differences resulted in various problems, some expected and others unexpected, that had to be resolved before the workflow could be run with equal semantics. As a side-effect, we also report cost/runtime ratios for a state-of-the-art NGS workflow on very different hardware platforms: A comparably cheap stand-alone server (80 threads), a mid-cost, mid-sized cluster (552 threads), and a high-end HPC system (3784 threads).},
     author = {Christopher Schiefer and Marc Bux and Joergen Brandt and Clemens Messerschmidt and Knut Reinert and Dieter Beule and Ulf Leser},
     month = {June},
     publisher = {arXiv.org},
     title = {Portability of Scientific Workflows in NGS Data Analysis: A Case Study},
     type = {Other},
     url = {http://publications.imp.fu-berlin.de/2510/},
     year = {2020}
    }
  • J.J. Kim, J. Kim, K. Reinert, “Vaquita-LR: A new bioinformatics tool for identifying structural variants using long and short reads. In: Abstracts from the 53rd European Society of Human Genetics (ESHG) Conference: Interactive e-Posters”, vol. 28, iss. S1, 2020-12-01.
    cite this publication
    @inproceedings{fu_mi_publications2511,
     abstract = {Introduction: 
The identification of structural variation in the genome is difficult due to the lack of a "perfect" sequencing technology. Vaquita-LR is a novel bioinformatics approach that combines short and long read sequencing data to identify more variants with higher accuracy, utilizing the strengths of one method to overcome the weaknesses of the other.
    
    Materials and Methods: 
    Vaquita-LR is an extension of Vaquita, a short read variant caller, which has been modified to identify potential variants in long reads in addition to short reads and merge them to provide a final set of variants. Vaquita-LR takes into consideration the read depth, read length, and other characteristics of the sequencing data to add weight to more confident calls. Additionally, it adapts techniques from Pilon to improve the accuracy of the long read data using relevant short read information.
    
    Results: 
    By combining short and long reads, Vaquita-LR is able to outperform other structural variant callers, including meta callers which combine results from other callers. Importantly, supplying long reads is effective at sequencing depths as shallow as 1x. Using this combination allows Vaquita-LR to better filter out false positives while retaining true positives, providing a better list of possible causal variants for further investigation.
    
    Conclusions: 
    With the abundance of data available today, it is important to consider how the data can be effectively merged to better serve our needs. Vaquita-LR is an initial step in showing the usefulness of integrating different sequencing data types when identifying structural variants.},
     author = {J.J. Kim and J. Kim and K. Reinert},
     booktitle = {53rd European Society of Human Genetics (ESHG) Conference},
     journal = {European Journal of Human Genetics},
     month = {December},
     number = {S1},
     pages = {141--797},
     title = {Vaquita-LR: A new bioinformatics tool for identifying structural variants using long and short reads. In: Abstracts from the 53rd European Society of Human Genetics (ESHG) Conference: Interactive e-Posters},
     url = {http://publications.imp.fu-berlin.de/2511/},
     volume = {28},
     year = {2020}
    }
  • Robert Pieper, Temesgen H. Dadi, Laura Pieper, Wilfried Vahjen, André Franke, Knut Reinert, Jürgen Zentek, “Concentration and chemical form of dietary zinc shape the porcine colon microbiome, its functional capacity and antibiotic resistance gene repertoire”, vol. 14, iss. 11, 2020-08-03.
    cite this publication
    @article{fu_mi_publications2512,
abstract = {Despite a well-documented effect of high dietary zinc oxide on the composition of the pig intestinal microbiota, little is yet known about changes in microbial functional properties or the effect of organic zinc sources. Forty weaning piglets in four groups were fed diets supplemented with 40 or 110 ppm zinc as zinc oxide, 110 ppm as Zn-Lysinate, or 2500 ppm as zinc oxide. Host zinc homeostasis, intestinal zinc fractions, and ileal nutrient digestibility were determined as main nutritional and physiological factors putatively driving colon microbial ecology. Metagenomic sequencing of colon microbiota revealed clear differences at the genus level only for the group receiving 2500 ppm zinc oxide. However, a clear group differentiation according to dietary zinc concentration and source was observed at species level. Functional analysis revealed significant differences in genes related to stress response, mineral, and carbohydrate metabolism. Taxonomic and functional gene differences were accompanied by clear effects on microbial metabolite concentrations. Finally, a selection of certain antibiotic resistance genes by dietary zinc was observed. This study sheds further light on the consequences of concentration and chemical form of dietary zinc on microbial ecology measures and the resistome in the porcine colon.},
     author = {Robert Pieper and Temesgen H. Dadi and Laura Pieper and Wilfried Vahjen and Andr{\'e} Franke and Knut Reinert and J{\"u}rgen Zentek},
     journal = {The ISME Journal},
     month = {August},
     number = {11},
     pages = {2783--2793},
     publisher = {Nature Portfolio},
     title = {Concentration and chemical form of dietary zinc shape the porcine colon microbiome, its functional capacity and antibiotic resistance gene repertoire},
     url = {http://publications.imp.fu-berlin.de/2512/},
     volume = {14},
     year = {2020}
    }
  • Bernhard Y Renard, Knut Reinert, Enrico Seiler, Temesgen H Dadi, Vitor C Piro, “ganon: precise metagenomics classification against large and up-to-date sets of reference sequences”, vol. 36, iss. Supple, 2020-07-13.
    cite this publication
    @article{fu_mi_publications2514,
     abstract = {Motivation: 
    
The exponential growth of assembled genome sequences greatly benefits metagenomics studies. However, currently available methods struggle to manage the increasing amount of sequences and their frequent updates. Indexing the current RefSeq can take days and hundreds of GB of memory on large servers. Few methods address these issues thus far, and even though many can theoretically handle large amounts of references, time/memory requirements are prohibitive in practice. As a result, many studies that require sequence classification often use outdated and almost never truly up-to-date indices.
    
    Results: 
    
    Motivated by those limitations, we created ganon, a k-mer-based read classification tool that uses Interleaved Bloom Filters in conjunction with a taxonomic clustering and a k-mer counting/filtering scheme. Ganon provides an efficient method for indexing references, keeping them updated. It requires {\ensuremath{<}}55 min to index the complete RefSeq of bacteria, archaea, fungi and viruses. The tool can further keep these indices up-to-date in a fraction of the time necessary to create them. Ganon makes it possible to query against very large reference sets and therefore it classifies significantly more reads and identifies more species than similar methods. When classifying a high-complexity CAMI challenge dataset against complete genomes from RefSeq, ganon shows strongly increased precision with equal or better sensitivity compared with state-of-the-art tools. With the same dataset against the complete RefSeq, ganon improved the F1-score by 65\% at the genus level. It supports taxonomy- and assembly-level classification, multiple indices and hierarchical classification.
    
    Availability and implementation: 
    
    The software is open-source and available at: https://gitlab.com/rki\_bioinformatics/ganon.},
     author = {Bernhard Y Renard and Knut Reinert and Enrico Seiler and Temesgen H Dadi and Vitor C Piro},
     journal = {Bioinformatics},
     month = {July},
     number = {Supple},
     pages = {i12--i20},
     publisher = {Oxford University Press, Oxford Academic},
     title = {ganon: precise metagenomics classification against large and up-to-date sets of reference sequences},
     url = {http://publications.imp.fu-berlin.de/2514/},
     volume = {36},
     year = {2020}
    }
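    The k-mer counting/filtering scheme mentioned in the abstract can be illustrated with the classic k-mer (q-gram) lemma: a read of length L contains L - k + 1 k-mers, and each of e allowed errors can destroy at most k of them, so a reference bin containing an e-error match must share at least L - k + 1 - k*e k-mers with the read. A self-contained C++ sketch with made-up counts (not ganon's actual code):

      #include <cstddef>
      #include <iostream>
      #include <vector>

      // Minimum number of k-mers a read of length read_len must share with a
      // reference bin for a match with at most e errors to be possible.
      std::size_t kmer_threshold(std::size_t read_len, std::size_t k, std::size_t e)
      {
          std::size_t const total = read_len - k + 1; // k-mers in the read
          return (k * e >= total) ? 0 : total - k * e;
      }

      int main()
      {
          // Hypothetical per-bin k-mer hit counts for one 100 bp read, k = 19, e = 2.
          std::vector<std::size_t> const hits_per_bin{80, 12, 47, 3};
          std::size_t const t = kmer_threshold(100, 19, 2); // 82 - 38 = 44

          for (std::size_t bin = 0; bin < hits_per_bin.size(); ++bin)
              if (hits_per_bin[bin] >= t) // only these bins need verification
                  std::cout << "candidate bin " << bin << '\n'; // prints bins 0 and 2
      }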
  • Knut Reinert, Costas S Iliopoulos, Mai Alzamel, Christopher Pockrandt, Jinbo Xu, “GenMap: ultra-fast computation of genome mappability”, vol. 36, iss. 12, 2020-04-04.
    cite this publication
    @article{fu_mi_publications2515,
     abstract = {Motivation: 
    
    Computing the uniqueness of k-mers for each position of a genome while allowing for up to e mismatches is computationally challenging. However, it is crucial for many biological applications such as the design of guide RNA for CRISPR experiments. More formally, the uniqueness or (k, e)-mappability can be described for every position as the reciprocal value of how often this k-mer occurs approximately in the genome, i.e. with up to e mismatches.
    
    Results: 
    
    We present a fast method GenMap to compute the (k, e)-mappability. We extend the mappability algorithm, such that it can also be computed across multiple genomes where a k-mer occurrence is only counted once per genome. This allows for the computation of marker sequences or finding candidates for probe design by identifying approximate k-mers that are unique to a genome or that are present in all genomes. GenMap supports different formats such as binary output, wig and bed files as well as csv files to export the location of all approximate k-mers for each genomic position.
    
    Availability and implementation: 
    
    GenMap can be installed via bioconda. Binaries and C++ source code are available on https://github.com/cpockrandt/genmap.},
     author = {Knut Reinert and Costas S Iliopoulos and Mai Alzamel and Christopher Pockrandt and Jinbo Xu},
     journal = {Bioinformatics},
     month = {April},
     number = {12},
     pages = {3687--3692},
     publisher = {Oxford University Press, Oxford Academic},
     title = {GenMap: ultra-fast computation of genome mappability},
     url = {http://publications.imp.fu-berlin.de/2515/},
     volume = {36},
     year = {2020}
    }
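    For reference, the (k, e)-mappability described above is, per position, the reciprocal of the number of occurrences of that position's k-mer in the genome with up to e mismatches. The brute-force C++ sketch below implements this definition directly; GenMap computes the same values via an index, orders of magnitude faster.

      #include <cstddef>
      #include <iostream>
      #include <string>
      #include <vector>

      // Brute-force (k, e)-mappability; assumes text.size() >= k.
      std::vector<double> mappability(std::string const & text, std::size_t k, std::size_t e)
      {
          std::size_t const n = text.size() - k + 1;
          std::vector<double> result(n);
          for (std::size_t i = 0; i < n; ++i)
          {
              std::size_t count = 0;
              for (std::size_t j = 0; j < n; ++j)
              {
                  std::size_t mismatches = 0;
                  for (std::size_t p = 0; p < k && mismatches <= e; ++p)
                      mismatches += (text[i + p] != text[j + p]);
                  count += (mismatches <= e);
              }
              result[i] = 1.0 / count; // position i always matches itself, so count >= 1
          }
          return result;
      }

      int main()
      {
          for (double m : mappability("ACGTACGAAC", 4, 1)) // toy text, k = 4, e = 1
              std::cout << m << ' ';
          std::cout << '\n';
      }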
  • Alexander Schönhuth, M-Hossein Moeinzadeh, Jun Yang, Evgeny Muzychenko, Giuseppe Gallone, David Heller, Knut Reinert, Stefan Haas, Martin Vingron, “Ranbow: A fast and accurate method for polyploid haplotype reconstruction”, vol. 16, iss. 5, 2020-05-29.
    cite this publication
    @article{fu_mi_publications2516,
abstract = {Reconstructing haplotypes from sequencing data is one of the major challenges in genetics. Haplotypes play a crucial role in many analyses, including genome-wide association studies and population genetics. Haplotype reconstruction becomes more difficult for higher numbers of homologous chromosomes, as is often the case for polyploid plants. This complexity is compounded further by higher heterozygosity, which denotes the frequent presence of variants between haplotypes. We have designed Ranbow, a new tool for haplotype reconstruction of polyploid genomes from short read sequencing data. Ranbow integrates all types of small variants in bi- and multi-allelic sites to reconstruct haplotypes. To evaluate Ranbow and currently available competing methods on real data, we have created and released a real gold standard dataset from sweet potato sequencing data. Our evaluations on real and simulated data clearly show Ranbow's superior performance in terms of accuracy, haplotype length, memory usage, and running time. Specifically, Ranbow is one order of magnitude faster than the next best method. The efficiency and accuracy of Ranbow makes whole genome haplotype reconstruction of complex genomes with higher ploidy feasible.},
     author = {Alexander Sch{\"o}nhuth and M-Hossein Moeinzadeh and Jun Yang and Evgeny Muzychenko and Giuseppe Gallone and David Heller and Knut Reinert and Stefan Haas and Martin Vingron},
     journal = {PLOS Computational Biology},
     month = {May},
     number = {5},
     pages = {e1007843},
     publisher = {PLOS},
     title = {Ranbow: A fast and accurate method for polyploid haplotype reconstruction},
     url = {http://publications.imp.fu-berlin.de/2516/},
     volume = {16},
     year = {2020}
    }
  • Franziska Hufsky, Kevin Lamkiewicz, Alexandre Almeida, Abdel Aouacheria, Cecilia Arighi, Alex Bateman, Jan Baumbach, Niko Beerenwinkel, Christian Brandt, Marco Cacciabue, Sara Chuguransky, Oliver Drechsel, Robert D Finn, Adrian Fritz, Stephan Fuchs, Georges Hattab, Anne-Christin Hauschild, Dominik Heider, Marie Hoffmann, Martin Hölzer, Stefan Hoops, Lars Kaderali, Ioanna Kalvari, Max von Kleist, Renó Kmiecinski, Denise Kühnert, Gorka Lasso, Pieter Libin, Markus List, Hannah F Löchel, Maria J Martin, Roman Martin, Julian Matschinske, Alice C McHardy, Pedro Mendes, Jaina Mistry, Vincent Navratil, Eric P Nawrocki, Áine Niamh O’Toole, Nancy Ontiveros-Palacios, Anton I Petrov, Guillermo Rangel-Pineros, Nicole Redaschi, Susanne Reimering, Knut Reinert, Alejandro Reyes, Lorna Richardson, David L Robertson, Sepideh Sadegh, Joshua B Singer, Kristof Theys, Chris Upton, Marius Welzel, Lowri Williams, Manja Marz, “Computational strategies to combat COVID-19: useful tools to accelerate SARS-CoV-2 and coronavirus research”, vol. 22, iss. 2, 2020-11-04.
    cite this publication
    @article{fu_mi_publications2517,
     abstract = {SARS-CoV-2 (severe acute respiratory syndrome coronavirus 2) is a novel virus of the family Coronaviridae. The virus causes the infectious disease COVID-19. The biology of coronaviruses has been studied for many years. However, bioinformatics tools designed explicitly for SARS-CoV-2 have only recently been developed as a rapid reaction to the need for fast detection, understanding and treatment of COVID-19. To control the ongoing COVID-19 pandemic, it is of utmost importance to get insight into the evolution and pathogenesis of the virus. In this review, we cover bioinformatics workflows and tools for the routine detection of SARS-CoV-2 infection, the reliable analysis of sequencing data, the tracking of the COVID-19 pandemic and evaluation of containment measures, the study of coronavirus evolution, the discovery of potential drug targets and development of therapeutic strategies. For each tool, we briefly describe its use case and how it advances research specifically for SARS-CoV-2. All tools are free to use and available online, either through web applications or public code repositories.},
author = {Franziska Hufsky and Kevin Lamkiewicz and Alexandre Almeida and Abdel Aouacheria and Cecilia Arighi and Alex Bateman and Jan Baumbach and Niko Beerenwinkel and Christian Brandt and Marco Cacciabue and Sara Chuguransky and Oliver Drechsel and Robert D Finn and Adrian Fritz and Stephan Fuchs and Georges Hattab and Anne-Christin Hauschild and Dominik Heider and Marie Hoffmann and Martin H{\"o}lzer and Stefan Hoops and Lars Kaderali and Ioanna Kalvari and Max von Kleist and Ren{\'o} Kmiecinski and Denise K{\"u}hnert and Gorka Lasso and Pieter Libin and Markus List and Hannah F L{\"o}chel and Maria J Martin and Roman Martin and Julian Matschinske and Alice C McHardy and Pedro Mendes and Jaina Mistry and Vincent Navratil and Eric P Nawrocki and {\'A}ine Niamh O'Toole and Nancy Ontiveros-Palacios and Anton I Petrov and Guillermo Rangel-Pineros and Nicole Redaschi and Susanne Reimering and Knut Reinert and Alejandro Reyes and Lorna Richardson and David L Robertson and Sepideh Sadegh and Joshua B Singer and Kristof Theys and Chris Upton and Marius Welzel and Lowri Williams and Manja Marz},
     journal = {Briefings in Bioinformatics},
     keywords = {virus bioinformatics, SARS-CoV-2, sequencing, epidemiology, drug design, tools},
     month = {November},
     number = {2},
     pages = {642--663},
     publisher = {Oxford University Press, Public Health Emergency Collection},
     title = {Computational strategies to combat COVID-19: useful tools to accelerate SARS-CoV-2 and coronavirus research},
     url = {http://publications.imp.fu-berlin.de/2517/},
     volume = {22},
     year = {2020}
    }
  • Natalie Witt, Sandro Andreotti, Anne Busch, Kerstin Neubert, Knut Reinert, Herbert Tomaso, David Meierhofer, “Rapid and Culture Free Identification of Francisella in Hare Carcasses by High-Resolution Tandem Mass Spectrometry Proteotyping”, vol. 11, 2020-05-08.
    cite this publication
    @article{fu_mi_publications2518,
abstract = {Zoonotic pathogens that can be transmitted via food to humans have a high potential for large-scale emergencies, comprising severe effects on public health, critical infrastructures, and the economy. In this context, laboratory methods to rapidly detect zoonotic bacteria in the food supply chain, including high-resolution mass spectrometry proteotyping, are needed. In this work, an optimized sample preparation method for liquid chromatography-tandem mass spectrometry (LC-MS/MS)-based proteome profiling was established for Francisella isolates, and a cluster analysis, as well as a phylogenetic tree, was generated to shed light on evolutionary relationships. Furthermore, this method was applied to tissues of infected hare carcasses from Germany. Even though non-informative data outnumbered the signal of the zoonotic pathogen in the resulting proteome profiles many times over, the standardized evaluation of MS data within an established automated analysis pipeline identified Francisella (F.) tularensis and could thus, in principle, be an applicable method to monitor food supply chains.},
     author = {Natalie Witt and Sandro Andreotti and Anne Busch and Kerstin Neubert and Knut Reinert and Herbert Tomaso and David Meierhofer},
     journal = {Frontiers in Microbiology},
     month = {May},
     publisher = {Frontiers Media},
     title = {Rapid and Culture Free Identification of Francisella in Hare Carcasses by High-Resolution Tandem Mass Spectrometry Proteotyping},
     url = {http://publications.imp.fu-berlin.de/2518/},
     volume = {11},
     year = {2020}
    }
  • Evgenia Ntini, Stefan Budach, Ulf A Vang Ørom, Annalisa Marsico, “Predictive modeling of long non-coding RNA chromatin (dis-)association”, 2020-12-17.
    cite this publication
    @article{fu_mi_publications2572,
abstract = {Long non-coding RNAs (lncRNAs) are involved in gene expression regulation in cis and trans. Although enriched in the chromatin cell fraction, to what degree this defines their broad range of functions remains unclear. In addition, the factors that contribute to lncRNA chromatin tethering, as well as the molecular basis of efficient lncRNA chromatin dissociation and its functional impact on enhancer activity and target gene expression, remain to be resolved. Here, we combine pulse-chase metabolic labeling of nascent RNA with chromatin fractionation and transient transcriptome sequencing to follow nascent RNA transcripts from their co-transcriptional state to their release into the nucleoplasm. By incorporating functional and physical characteristics in machine learning models, we find that parameters like co-transcriptional splicing contribute to efficient lncRNA chromatin dissociation. Intriguingly, lncRNAs transcribed from enhancer-like regions display reduced chromatin retention, suggesting that, in addition to splicing, lncRNA chromatin dissociation may contribute to enhancer activity and target gene expression.},
     author = {Evgenia Ntini and Stefan Budach and Ulf A Vang {\O}rom and Annalisa Marsico},
     journal = {bioRxiv},
     month = {December},
     publisher = {Cold Spring Harbor Laboratory},
     title = {Predictive modeling of long non-coding RNA chromatin (dis-)association},
     url = {http://publications.imp.fu-berlin.de/2572/},
     year = {2020}
    }
  • Sabrina Krakau, “Statistical models to capture protein-RNA interaction footprints from truncation-based CLIP-seq data”, 2020-01-15.
    cite this publication
    @phdthesis{fu_mi_publications2848,
abstract = {Protein-RNA interactions play an important role in all post-transcriptional regulatory processes. High throughput detection of protein-RNA interactions has been facilitated by the emerging CLIP-seq (crosslinking and immunoprecipitation combined with high-throughput sequencing) techniques. Enrichments in mapped reads as well as base transitions or deletions at crosslink sites can be used to infer binding regions. Single-nucleotide resolution techniques (iCLIP and eCLIP) have been achieved by capturing high fractions of cDNAs which are truncated at protein-RNA crosslink sites. Increasing numbers of datasets and derivatives of these protocols have been published in recent years, requiring tailored computational analyses. Existing methods unfortunately do not explicitly model the specifics of truncation patterns and possible biases caused by background binding or crosslinking sequence preferences. We present PureCLIP, a hidden Markov model based approach, which simultaneously performs peak calling and individual crosslink site detection. It is capable of incorporating external data to correct for non-specific background signals and, for the first time, for the crosslinking biases. We devised a comprehensive evaluation based on three strategies. Firstly, we developed a workflow to simulate iCLIP data, which starts from real RNA-seq data and known binding regions and then mimics the experimental steps of the iCLIP protocol, including the generation of background signals. Secondly, we used experimental iCLIP and eCLIP datasets, using the proteins' known predominant binding regions. And thirdly, we assessed the agreement of called sites between replicates, assuming target-specific signals are reproducible between replicates. On both simulated and real data, PureCLIP is consistently more precise in calling crosslink sites than other state-of-the-art methods. In particular when incorporating input control data and crosslink associated motifs (CL-motifs) PureCLIP is up to 13\% more precise than other methods and we show that it has an up to 20\% higher agreement across replicates. Moreover, our method can optionally merge called crosslink sites to binding regions based on their distance and we show that the resulting regions reflect the known binding regions with high-resolution. Additionally, we demonstrate that our method achieves a high precision robustly over a range of different settings and performs well for proteins with different binding characteristics. Lastly, we extended the method to include individual CLIP replicates and show that this can boost the precision even further. PureCLIP and its documentation are publicly available at https://github.com/skrakau/PureCLIP.},
     author = {Sabrina Krakau},
     month = {January},
     school = {Freie Universit{\"a}t Berlin},
     title = {Statistical models to capture protein-RNA interaction footprints from truncation-based CLIP-seq data},
     url = {http://publications.imp.fu-berlin.de/2848/},
     year = {2020}
    }
  • Marten Jäger, “Annotation und Interpretation von Varianten und Polymorphismen im humanen Genom”, 2020-03-09.
    cite this publication
    @phdthesis{fu_mi_publications2853,
abstract = {Thanks to the development of high-throughput DNA sequencing technologies, the total volume of genomic sequencing data has grown at an incredible pace in recent years. This expands our knowledge of the human genome regarding its composition, structure and spatial organization. In the near future, this understanding of the genome's complex architecture will have to inform the interpretation of variants in the clinical context as well, since it holds numerous potential opportunities for diagnostics. Whole genome sequencing has already made the leap from research laboratories into applied hospital diagnostics, enabling the introduction of precision medicine for all patients. For an optimal clinical interpretation of genomic variants it is important to use consistent and suitable references. Besides the choice of the reference genome, this includes the database used for annotating functional units on the DNA. This thesis addresses two important steps on the way towards using WGS in clinical routine. The first step is to assign detected variants to genomic properties and features (relative to a reference) as quickly as possible, which has become an increasing problem due to the large data volumes. With Jannovar, a software library is presented that is excellently adapted to these demands. The library is fast and flexible and can easily be integrated into annotation pipelines and custom programs. The genotype changes annotated and characterized in this way form a basis for further interpretation and assessment by other programs. The representation of the genome reference is evolving towards a graph genome in order to capture at least part of the population-specific variability, which can have an enormous influence on the interpretation of variants. The second step is concerned with resolving this population-specific complexity. With ASDPex, a heuristic algorithm is presented that predicts the presence of alternative haplotype sequences in the WGS data of an individual. To do so, it uses the distribution of allele frequencies of the individual's variants and matches it against a kind of fingerprint made up of haplotype-specific variants. Knowledge of these alternative sequences can further improve the reliability of clinical interpretation. Future work will aim to integrate even more data into variant interpretation in order to prevent further false-positive/false-negative associations and to filter out irrelevant variants.},
     author = {Marten J{\"a}ger},
     month = {March},
     school = {Freie Universit{\"a}t Berlin},
     title = {Annotation und Interpretation von Varianten und Polymorphismen im humanen Genom},
     url = {http://publications.imp.fu-berlin.de/2853/},
     year = {2020}
    }

2019

  • Laurence O. W. Wilson, Sara Hetzel, Christopher Pockrandt, Knut Reinert, Denis C. Bauer, “VARSCOT: variant-aware detection and scoring enables sensitive and personalized off-target detection for CRISPR-Cas9”, vol. 19, iss. 1, 2019-06-27.
    cite this publication
    @article{fu_mi_publications2430,
     abstract = {Background
    
Natural variations in a genome can drastically alter the CRISPR-Cas9 off-target landscape by creating or removing sites. Despite the potential side-effects of such unaccounted-for sites, current off-target detection pipelines are not equipped to include variant information. To address this, we developed VARiant-aware detection and SCoring of Off-Targets (VARSCOT).
    
    Results
    
    VARSCOT identifies only 0.6\% of off-targets to be common between 4 individual genomes and the reference, with an average of 82\% of off-targets unique to an individual. VARSCOT is the most sensitive detection method for off-targets, finding 40 to 70\% more experimentally verified off-targets compared to other popular software tools and its machine learning model allows for CRISPR-Cas9 concentration aware off-target activity scoring.
    
    Conclusions
    
    VARSCOT allows researchers to take genomic variation into account when designing individual or population-wide targeting strategies. VARSCOT is available from https://github.com/BauerLab/VARSCOT.},
     author = {Laurence O. W. Wilson and Sara Hetzel and Christopher Pockrandt and Knut Reinert and Denis C. Bauer},
     journal = {BMC Biotechnology},
     keywords = {CRISPR-Cas9, Off-target detection, Variants, Genome editing},
     month = {June},
     number = {1},
     title = {VARSCOT: variant-aware detection and scoring enables sensitive and personalized off-target detection for CRISPR-Cas9},
     url = {http://publications.imp.fu-berlin.de/2430/},
     volume = {19},
     year = {2019}
    }
  • Enrico Seiler, Kathrin Trappe, Bernhard Y. Renard, “Where did you come from, where did you go: Refining metagenomic analysis tools for horizontal gene transfer characterisation”, vol. 15, iss. 7, 2019-07-23.
    cite this publication
    @article{fu_mi_publications2434,
     abstract = {Horizontal gene transfer (HGT) has changed the way we regard evolution. Instead of waiting for the next generation to establish new traits, especially bacteria are able to take a shortcut via HGT that enables them to pass on genes from one individual to another, even across species boundaries. The tool Daisy offers the first HGT detection approach based on read mapping that provides complementary evidence compared to existing methods. However, Daisy relies on the acceptor and donor organism involved in the HGT being known. We introduce DaisyGPS, a mapping-based pipeline that is able to identify acceptor and donor reference candidates of an HGT event based on sequencing reads. Acceptor and donor identification is akin to species identification in metagenomic samples based on sequencing reads, a problem addressed by metagenomic profiling tools. However, acceptor and donor references have certain properties such that these methods cannot be directly applied. DaisyGPS uses MicrobeGPS, a metagenomic profiling tool tailored towards estimating the genomic distance between organisms in the sample and the reference database. We enhance the underlying scoring system of MicrobeGPS to account for the sequence patterns in terms of mapping coverage of an acceptor and donor involved in an HGT event, and report a ranked list of reference candidates. These candidates can then be further evaluated by tools like Daisy to establish HGT regions. We successfully validated our approach on both simulated and real data, and show its benefits in an investigation of an outbreak involving Methicillin-resistant Staphylococcus aureus data.
    
    Author summary
    
Evolution is traditionally viewed as a process where changes are only vertically inherited from parent to offspring across generations. Many principles such as phylogenetic trees and even the "tree of life" are based on that doctrine. The concept of horizontal gene transfer changed the way we regard evolution completely. Horizontal gene transfer is the movement of genetic information between distantly related organisms of the same generation. Genome sequencing not only provided further evidence complementing experimental evidence but also shed light on the frequency and prominence of this concept. Especially the rapid spread of antimicrobial resistance genes is a prominent example of the impact that horizontal gene transfer can have on public health. Next generation sequencing brought means for quick and relatively cheap analysis of even complex metagenomic samples where horizontal gene transfer is bound to happen frequently. Methods to directly detect and characterise horizontal gene transfer from such sequencing data, however, are still lacking. We here provide a method to identify organisms potentially involved in horizontal gene transfer events to be used in downstream analysis that enables a characterisation of a horizontal gene transfer event in terms of impact and prevalence.
     author = {Enrico Seiler and Kathrin Trappe and Bernhard Y. Renard},
     journal = {PLOS Computational Biology},
     month = {July},
     number = {7},
     pages = {e1007208},
     publisher = {Public Library of Science},
     title = {Where did you come from, where did you go: Refining metagenomic analysis tools for horizontal gene transfer characterisation},
     url = {http://publications.imp.fu-berlin.de/2434/},
     volume = {15},
     year = {2019}
    }
  • Christopher Maximilian Pockrandt, “Approximate String Matching - Improving Data Structures and Algorithms”, 2019-04-15.
    cite this publication
    @phdthesis{fu_mi_publications2523,
abstract = {This thesis addresses important algorithms and data structures used in sequence analysis for applications such as read mapping. First, we give an overview of state-of-the-art FM indices and present the latest improvements. In particular, we will introduce a recently published FM index based on a new data structure: EPR dictionaries. This rank data structure allows search steps in constant time for unidirectional and bidirectional FM indices. To our knowledge this is the first and only constant-time implementation of a bidirectional FM index at the time of writing. We show that its running time is not only optimal in theory, but currently also outperforms all available FM index implementations in practice.
    
    Second, we cover approximate string matching in bidirectional indices. To improve the running time and make higher error rates suitable for index-based searches, we introduce an integer linear program for finding optimal search strategies. We show that it is significantly faster than other search strategies in indices and cover additional improvements such as hybrid approaches of index-based searches with in-text verification, i.e., at some point the partially matched string is located and verified directly in the text.
    
    Finally, we present a yet unpublished algorithm for fast computation of the mappability of genomic sequences. Mappability is a measure for the uniqueness of a genome by counting how often each \$k\$-mer of the sequence occurs with a certain error threshold in the genome itself. We suggest two applications of mappability with prototype implementations: First, a read mapper incorporating the mappability information to improve the running time when mapping reads that match highly repetitive regions, and second, we use the mappability information to identify phylogenetic markers in a set of similar strains of the same species by the example of E. coli. Unique regions allow identifying and distinguishing even highly similar strains using unassembled sequencing data.
    
    The findings in this thesis can speed up many applications in bioinformatics as we demonstrate for read mapping and computation of mappability, and give suggestions for further research in this field.},
     author = {Christopher Maximilian Pockrandt},
     month = {April},
     school = {Freie Universit{\"a}t Berlin},
     title = {Approximate String Matching - Improving Data Structures and Algorithms},
     url = {http://publications.imp.fu-berlin.de/2523/},
     year = {2019}
    }
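    The constant-time rank queries at the heart of EPR dictionaries build on the classic two-level bit-vector rank technique: a precomputed prefix count per block plus a popcount within the final block. The C++20 sketch below shows only this textbook binary building block, not the EPR dictionary itself, which extends the idea to larger alphabets such as DNA.

      #include <bit>
      #include <cstddef>
      #include <cstdint>
      #include <iostream>
      #include <utility>
      #include <vector>

      struct RankSupport
      {
          std::vector<uint64_t> bits;         // the bit vector, 64 bits per block
          std::vector<uint64_t> block_prefix; // number of 1-bits before each block

          explicit RankSupport(std::vector<uint64_t> b)
              : bits(std::move(b)), block_prefix(bits.size())
          {
              uint64_t sum = 0;
              for (std::size_t i = 0; i < bits.size(); ++i)
              {
                  block_prefix[i] = sum;
                  sum += std::popcount(bits[i]);
              }
          }

          // Number of 1-bits in positions [0, pos); requires pos < 64 * bits.size().
          uint64_t rank(std::size_t pos) const
          {
              std::size_t const block = pos / 64, offset = pos % 64;
              uint64_t const partial =
                  offset ? std::popcount(bits[block] << (64 - offset)) : 0;
              return block_prefix[block] + partial;
          }
      };

      int main()
      {
          RankSupport rs({0b1011ULL, ~0ULL}); // bits 0, 1, 3 set, then 64 ones
          std::cout << rs.rank(4) << ' ' << rs.rank(70) << '\n'; // prints "3 9"
      }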
  • Temesgen Hailemariam Dadi, “Whole Genome Shotgun Sequencing Based Taxonomic Profiling Methods for Comparative Study of Microbial Communities”, 2019-05-03.
    cite this publication
    @phdthesis{fu_mi_publications2524,
     abstract = {Microorganisms, typically occurring as large, species diverse communities, are a ubiquitous part of nature. These communities are a vital part of their environment, influencing it through various layers of interaction. Host-associated microbial communities are particularly scrutinized for their influence on the host?s health. Additionally, there is a growing interest in microbial communities due to their role in livestock, agriculture, waste treatment, mining, and biotechnology. Metagenomics is a relatively young scientific field that aims to study such microbial communities based on genetic material recovered directly from an environment. Advances in DNA sequencing have enabled us to perform taxonomic profiling, i.e. to identify microbial species quantitatively and qualitatively at increasing depth.
    
In whole genome shotgun sequencing (WGS), environmental DNA is taken directly from an environment and sequenced after being fragmented, without PCR amplification. Taxonomic profiling methods based on such sequencing data introduce less PCR bias compared to their amplicon based counterparts such as 16S-rDNA based profiling methods. However, the challenges posed by the enormous size and redundancy of databases and the high degree of homology among reference genomes of microorganisms put WGS methods at a disadvantage. In this thesis, we will present and discuss two separate computational methods that address both challenges.
    
    The first method is a taxonomic profiler that leverages coverage landscapes created by mapping sequencing reads across reference genomes to address the challenge posed by homologous regions of genomes. By carefully evaluating the coverage profile of reference genomes we drop spurious references from consideration. This filtration strategy results in more uniquely mapping reads to the remaining reference genomes improving both the resolution and accuracy of the taxonomic profiling process. We have also shown that this method improves the quality of relative abundances assigned to each detected member organism.
    
The second method is a distributed read mapper which addresses the issue of large and frequently changing databases by systematically partitioning them into smaller bins. It significantly reduces the time and computational resources required to build indices from such large databases by orders of magnitude, and updates can be performed within a few minutes compared to days with earlier methods. To achieve a competitive mapping speed while maintaining many small indices, we implemented a novel, fast and lightweight filtering data structure called the interleaved Bloom filter. With that, we are able to achieve the described improvements in index building and updating time without compromising the read-mapping speed.
     author = {Temesgen Hailemariam Dadi},
     month = {May},
     school = {Freie Universit{\"a}t Berlin},
     title = {Whole Genome Shotgun Sequencing Based Taxonomic Profiling Methods for Comparative Study of Microbial Communities},
     url = {http://publications.imp.fu-berlin.de/2524/},
     year = {2019}
    }
  • Ulrike Löber, “Development of Bioinformatic Tools for Retroviral Analysis from High Throughput Sequence Data”, 2019-06-26.
    cite this publication
    @phdthesis{fu_mi_publications2533,
abstract = {For hundreds of millions of years, retroviruses have been integrating into genomes of vertebrates. This thesis contributes to the development of new methods for retrieval, characterization and the comparison of viruses that have integrated into the genome (endogenous retroviruses, or ERVs) and their integration sites in host genomes. The koala retrovirus is an outstanding study subject since it is currently in the transition from an exogenous to an endogenous retrovirus. In the past decades, high-throughput sequencing (HTS) has allowed scientists to investigate genomic data at high coverage and low costs. However, the development of new sequencing technologies facilitated the production of vast amounts of data. The analysis bottleneck has shifted from data production to the analysis of so-called "big data". In consequence, new algorithms and pipelines need to be established to process biological data. Solutions for automated handling of short-read HTS data exist for many problems and can be improved and extended. Recent improvements in HTS resulting in longer sequence fragments have helped solve problems connected to short-read sequencing but produced new challenges for genomics data processing. In this thesis, I present pipelines to comprehensively profile endogenous retroviruses from short-read HTS data for museum koala samples (ancient DNA) and describe a new method to amplify retroviral integration sites facilitating long-read HTS. The thesis is divided into five sections. In the first part, I describe the biological problem, the evolution of sequencing technologies, the resulting information technology problems and proposed solutions (chapter 1). In the second chapter, I present a comparison of three different target enrichment techniques to retrieve retroviral integration sites from museum koala samples. The computational pipeline I developed for this purpose is presented. In chapter 3 I describe a method (sonication inverse polymerase chain reaction) for target enrichment of long sequence fragments to exploit the capacities of third-generation sequencing technologies. An analysis pipeline for the processing of sonication inverse PCR products was established. Moreover, the remaining problems resulting from artificial read structures are discussed. In chapter 4 the method described in chapter 3 was used to profile koala retrovirus integrations. The striking discovery of a new retroviral recombinant in koalas is reported. Finally, I discuss our findings and compare short- and long-read HTS technologies. An outlook for further applications and remaining computational problems is outlined. Overall, this thesis contributes to the automated computational processing of HTS data from target enrichment techniques to profile endogenous retroviruses in host genomes.},
     author = {Ulrike L{\"o}ber},
     month = {June},
     school = {Freie Universit{\"a}t Berlin},
     title = {Development of Bioinformatic Tools for Retroviral Analysis from High Throughput Sequence Data},
     url = {http://publications.imp.fu-berlin.de/2533/},
     year = {2019}
    }

2018

  • T. Marschall, K. Reinert, (59 authors in total) others, “Computational pan-genomics: status, promises and challenges”, vol. 19, iss. 1, 2018-01-01.
    cite this publication
    @article{fu_mi_publications1981,
     abstract = {Many disciplines, from human genetics and oncology to plant breeding, microbiology and virology, commonly face the challenge of analyzing rapidly increasing numbers of genomes. In case of Homo sapiens, the number of sequenced genomes will approach hundreds of thousands in the next few years. Simply scaling up established bioinformatics pipelines will not be sufficient for leveraging the full potential of such rich genomic data sets. Instead, novel, qualitatively different computational methods and paradigms are needed. We will witness the rapid extension of computational pan-genomics, a new sub-area of research in computational biology. In this article, we generalize existing definitions and understand a pan-genome as any collection of genomic sequences to be analyzed jointly or to be used as a reference. We examine already available approaches to construct and use pan-genomes, discuss the potential benefits of future technologies and methodologies and review open challenges from the vantage point of the above-mentioned biological disciplines. As a prominent example for a computational paradigm shift, we particularly highlight the transition from the representation of reference genomes as strings to representations as graphs. We outline how this and other challenges from different application domains translate into common computational problems, point out relevant bioinformatics techniques and identify open problems in computer science. With this review, we aim to increase awareness that a joint approach to computational pan-genomics can help address many of the problems currently faced in various domains.},
     author = {T. Marschall and K. Reinert and (59 authors in total) others},
     journal = {Briefings in Bioinformatics},
     month = {January},
     number = {1},
     pages = {118--135},
     publisher = {Oxford University Press},
     title = {Computational pan-genomics: status, promises and challenges},
     url = {http://publications.imp.fu-berlin.de/1981/},
     volume = {19},
     year = {2018}
    }
  • René Rahn, Stefan Budach, Pascal Costanza, Marcel Ehrhardt, Jonny Hancox, Knut Reinert, “Generic accelerated sequence alignment in SeqAn using vectorization and multi-threading”, vol. 34, iss. 20, 2018-10-15.
    cite this publication
    @article{fu_mi_publications2253,
     abstract = {Motivation
    
Pairwise sequence alignment is undoubtedly a central tool in many bioinformatics analyses. In this paper, we present a generically accelerated module for pairwise sequence alignments applicable to a broad range of applications. In our module, we unified the standard dynamic programming kernel used for pairwise sequence alignments and extended it with a generalized inter-sequence vectorization layout, such that many alignments can be computed simultaneously by exploiting SIMD (Single Instruction Multiple Data) instructions of modern processors. We then extended the module by adding two layers of thread-level parallelization, where we a) distribute many independent alignments across multiple threads and b) inherently parallelize a single alignment computation using a work stealing approach producing a dynamic wavefront progressing along the minor diagonal.
    
    Results
    
We evaluated our alignment vectorization and parallelization on different processors, including the newest Intel Xeon (Skylake) and Intel Xeon Phi (KNL) processors, and use cases. The instruction set AVX512-BW (Byte and Word), available on Skylake processors, can genuinely improve the performance of vectorized alignments. We could run single alignments 1600 times faster on the Xeon Phi and 1400 times faster on the Xeon than executing them with our previous sequential alignment module.
    
    Availability
    
The module is programmed in C++ using the SeqAn (Reinert et al., 2017) library and distributed with version 2.4 under the BSD license. We support SSE4, AVX2 and AVX512 instructions and included UME::SIMD, a SIMD-instruction wrapper library, to extend our module for further instruction sets. We thoroughly test all alignment components with all major C++ compilers on various platforms.
     author = {Ren{\'e} Rahn and Stefan Budach and Pascal Costanza and Marcel Ehrhardt and Jonny Hancox and Knut Reinert},
     journal = {Bioinformatics},
     month = {October},
     number = {20},
     pages = {3437--3445},
     publisher = {Oxford Academic (OUP)},
     title = {Generic accelerated sequence alignment in SeqAn using vectorization and multi-threading},
     url = {http://publications.imp.fu-berlin.de/2253/},
     volume = {34},
     year = {2018}
    }
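    The inter-sequence vectorization layout described above packs the same DP cell of several independent alignments into one SIMD vector, so all alignments advance through the recurrence in lockstep. Below is a minimal scalar C++ sketch of that layout with the innermost loop over lanes standing in for the SIMD dimension (linear gap costs, equal-length toy sequences, made-up scores; not SeqAn's actual kernel):

      #include <algorithm>
      #include <array>
      #include <cstdint>
      #include <iostream>
      #include <string>
      #include <utility>
      #include <vector>

      constexpr std::size_t L = 4;          // number of simultaneous alignments
      using Lanes = std::array<int32_t, L>; // one DP cell per alignment ("vector")

      int main()
      {
          std::array<std::string, L> const a{"GATTACA", "ACGTACG", "AAAAAAA", "GATTACA"};
          std::array<std::string, L> const b{"GATGACA", "ACGGACG", "AATTAAA", "CATTACA"};
          int32_t const match = 2, mismatch = -1, gap = -2;
          std::size_t const m = a[0].size(), n = b[0].size();

          // Two DP rows; each entry holds one cell per lane (structure of arrays).
          std::vector<Lanes> prev(n + 1), curr(n + 1);
          for (std::size_t j = 0; j <= n; ++j)
              prev[j].fill(static_cast<int32_t>(j) * gap);

          for (std::size_t i = 1; i <= m; ++i)
          {
              curr[0].fill(static_cast<int32_t>(i) * gap);
              for (std::size_t j = 1; j <= n; ++j)
                  for (std::size_t l = 0; l < L; ++l) // the lane (SIMD) loop
                  {
                      int32_t const diag =
                          prev[j - 1][l] + (a[l][i - 1] == b[l][j - 1] ? match : mismatch);
                      curr[j][l] = std::max({diag, prev[j][l] + gap, curr[j - 1][l] + gap});
                  }
              std::swap(prev, curr);
          }
          for (std::size_t l = 0; l < L; ++l) // expected scores: 11 11 8 11
              std::cout << prev[n][l] << ' ';
          std::cout << '\n';
      }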
  • Temesgen Hailemariam Dadi, Enrico Siragusa, Vitor C Piro, Andreas Andrusch, Enrico Seiler, Bernhard Y Renard, Knut Reinert, “DREAM-Yara: an exact read mapper for very large databases with short update time”, vol. 34, iss. 17, 2018-09-08.
    cite this publication
    @article{fu_mi_publications2282,
     abstract = {Motivation
    
Mapping-based approaches have become limited in their application to very large sets of references since computing an FM-index for very large databases (e.g. {\ensuremath{>}}10 GB) has become a bottleneck. This affects many analyses that need such an index as an essential step for approximate matching of the NGS reads to reference databases. For instance, in typical metagenomics analysis, the size of the reference sequences has become prohibitive to compute a single full-text index on standard machines. Even on large memory machines, computing such an index takes about one day of computing time. As a result, updates of indices are rarely performed. Hence, it is desirable to create an alternative way of indexing while preserving fast search times.
    
    Results
    
To solve the index construction and update problem we propose the DREAM (Dynamic seaRchablE pArallel coMpressed index) framework and provide an implementation. The main contribution is the introduction of an approximate search distributor via a novel use of Bloom filters. We combine several Bloom filters to form an interleaved Bloom filter and use this new data structure to quickly exclude reads for parts of the databases where they cannot match. This allows us to keep the databases in several indices, which can be easily rebuilt if parts are updated, while maintaining a fast search time. The second main contribution is an implementation of DREAM-Yara, a distributed version of a fully sensitive read mapper under the DREAM framework.
    
    Availability and implementation: 
    https://gitlab.com/pirovc/dream\_yara/},
     author = {Temesgen Hailemariam Dadi and Enrico Siragusa and Vitor C Piro and Andreas Andrusch and Enrico Seiler and Bernhard Y Renard and Knut Reinert},
     journal = {Bioinformatics},
     month = {September},
     number = {17},
     pages = {i766--i772},
     title = {DREAM-Yara: an exact read mapper for very large databases with short update time},
     url = {http://publications.imp.fu-berlin.de/2282/},
     volume = {34},
     year = {2018}
    }
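    The interleaved Bloom filter introduced above stores one bit per bin side by side in each block, so a single lookup yields a membership bitmask over all bins at once instead of probing many separate Bloom filters. A self-contained C++ toy version of the principle, fixed at 64 bins so a block is one machine word (illustration only, not the SeqAn implementation):

      #include <bitset>
      #include <cstddef>
      #include <cstdint>
      #include <functional>
      #include <iostream>
      #include <vector>

      struct InterleavedBloomFilter
      {
          // 64 bins: bit i of every block belongs to bin i.
          std::vector<uint64_t> blocks; // one Bloom filter position per block
          std::size_t h;                // number of hash functions

          InterleavedBloomFilter(std::size_t num_blocks, std::size_t hashes)
              : blocks(num_blocks, 0), h(hashes) {}

          // Simple seeded mix; real implementations use stronger hash functions.
          std::size_t pos(uint64_t value, std::size_t seed) const
          {
              return std::hash<uint64_t>{}(value * 0x9e3779b97f4a7c15ULL + seed) % blocks.size();
          }

          void insert(uint64_t kmer_hash, std::size_t bin) // requires bin < 64
          {
              for (std::size_t s = 0; s < h; ++s)
                  blocks[pos(kmer_hash, s)] |= (1ULL << bin);
          }

          uint64_t bulk_contains(uint64_t kmer_hash) const // one result bit per bin
          {
              uint64_t mask = ~0ULL;
              for (std::size_t s = 0; s < h; ++s)
                  mask &= blocks[pos(kmer_hash, s)];
              return mask;
          }
      };

      int main()
      {
          InterleavedBloomFilter ibf(1024, 3);
          ibf.insert(42, 0); // pretend k-mer hash 42 occurs in bins 0 and 5
          ibf.insert(42, 5);
          ibf.insert(7, 63);
          // Bits 0 and 5 set (up to Bloom false positives):
          std::cout << std::bitset<64>(ibf.bulk_contains(42)) << '\n';
      }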
  • Łukasz Grześkowiak, Beatriz Martínez-Vallespín, Temesgen H Dadi, Judith Radloff, Salah Amasheh, Femke-Anouska Heinsen, Andre Franke, Knut Reinert, Wilfried Vahjen, Jürgen Zentek, Robert Pieper, “Formula Feeding Predisposes Neonatal Piglets to Clostridium difficile Gut Infection”, vol. 217, iss. 9, 2018-05-01.
    cite this publication
    @article{fu_mi_publications2283,
abstract = {Spontaneous outbreaks of Clostridium difficile infection (CDI) occur in neonatal piglets, but the predisposing factors are largely unknown. To study the conditions for C. difficile colonization and CDI development, 48 neonatal piglets were moved into isolators, fed bovine milk-based formula, and infected with C. difficile 078. Analyses included clinical scoring; measurement of the fecal C. difficile burden, toxin B level, and calprotectin level; and postmortem histopathological analysis of colon specimens. Controls were noninfected suckling piglets. Fecal specimens from suckling piglets, formula-fed piglets, and formula-fed, C. difficile-infected piglets were used for metagenomics analysis. High background levels of C. difficile and toxin were detected in formula-fed piglets prior to infection, while suckling piglets carried about 3-fold less C. difficile, and toxin was not detected. Toxin level in C. difficile-challenged animals correlated positively with C. difficile and calprotectin levels. Postmortem signs of CDI were absent in suckling piglets, whereas mesocolonic edema and gas-filled distal small intestines and ceca, cellular damage, and reduced expression of claudins were associated with animals from the challenge trials. Microbiota in formula-fed piglets was enriched with Escherichia, Shigella, Streptococcus, Enterococcus, and Ruminococcus species. Formula-fed piglets were predisposed to C. difficile colonization earlier than suckling piglets. Infection with a hypervirulent C. difficile ribotype did not aggravate the symptoms of infection. Sow-offspring association and consumption of porcine milk during early life may be crucial for the control of C. difficile expansion in piglets.},
     author = {{\L}ukasz Grze{\'s}kowiak and Beatriz Mart{\'i}nez-Vallesp{\'i}n and Temesgen H Dadi and Judith Radloff and Salah Amasheh and Femke-Anouska Heinsen and Andre Franke and Knut Reinert and Wilfried Vahjen and J{\"u}rgen Zentek and Robert Pieper},
     journal = {The Journal of Infectious Diseases},
     month = {May},
     number = {9},
     pages = {1442--1452},
     title = {Formula Feeding Predisposes Neonatal Piglets to Clostridium difficile Gut Infection},
     url = {http://publications.imp.fu-berlin.de/2283/},
     volume = {217},
     year = {2018}
    }
  • Kiavash Kianfar, Christopher Pockrandt, Bahman Torkamandi, Haochen Luo, Knut Reinert, “Optimum Search Schemes for Approximate String Matching Using Bidirectional FM-Index”, 2018-04-14.
    cite this publication
    @article{fu_mi_publications2284,
abstract = {Finding approximate occurrences of a pattern in a text using a full-text index is a central problem in bioinformatics and has been extensively researched. Bidirectional indices have opened new possibilities in this regard, allowing the search to start from anywhere within the pattern and extend in both directions. In particular, the use of search schemes (partitioning the pattern and searching the pieces in certain orders with given bounds on errors) can yield significant speed-ups. However, finding optimal search schemes is a difficult combinatorial optimization problem. Here, for the first time, we propose a mixed integer program (MIP) capable of solving this optimization problem for Hamming distance with a given number of pieces. Our experiments show that the optimal search schemes found by our MIP significantly improve the performance of search in the bidirectional FM-index over previous ad-hoc solutions. For example, approximate matching of 101-bp Illumina reads (with two errors) becomes 35 times faster than standard backtracking. Moreover, despite being performed purely in the index, the running time of search using our optimal schemes (for up to two errors) is comparable to the best state-of-the-art aligners, which benefit from combining search in index with in-text verification using dynamic programming. As a result, we anticipate that a full-fledged aligner that employs an intelligent combination of search in the bidirectional FM-index using our optimal search schemes and in-text verification using dynamic programming will outperform today's best aligners. The development of such an aligner, called FAMOUS (Fast Approximate string Matching using OptimUm search Schemes), is ongoing as our future work.},
     author = {Kiavash Kianfar and Christopher Pockrandt and Bahman Torkamandi and Haochen Luo and Knut Reinert},
     journal = {bioRxiv, The Preprint Server for Biology},
     month = {April},
     title = {Optimum Search Schemes for Approximate String Matching Using Bidirectional FM-Index},
     url = {http://publications.imp.fu-berlin.de/2284/},
     year = {2018}
    }
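    In the notation used above, a search scheme is a set of searches; each search fixes an order over the pattern pieces together with cumulative lower and upper error bounds after each step, and a scheme is complete for e errors if every distribution of at most e errors over the pieces is admitted by some search. The C++ sketch below checks this for the textbook two-piece, one-error scheme; it shows the format the paper's MIP optimizes over, not one of the optimized schemes themselves.

      #include <cstddef>
      #include <iostream>
      #include <vector>

      // One search: process pieces in order pi (1-based ids), with cumulative
      // lower bound L[step] and upper bound U[step] on errors after each step.
      struct Search
      {
          std::vector<int> pi, L, U;
      };

      // Does search s admit the given per-piece error distribution?
      bool admits(Search const & s, std::vector<int> const & errs)
      {
          int cum = 0;
          for (std::size_t step = 0; step < s.pi.size(); ++step)
          {
              cum += errs[s.pi[step] - 1];
              if (cum < s.L[step] || cum > s.U[step])
                  return false;
          }
          return true;
      }

      int main()
      {
          // Pigeonhole-style scheme for 2 pieces and 1 error: each search
          // starts in a different piece and requires it to match exactly.
          std::vector<Search> const scheme{{{1, 2}, {0, 0}, {0, 1}},
                                           {{2, 1}, {0, 0}, {0, 1}}};
          for (int e1 = 0; e1 <= 1; ++e1)
              for (int e2 = 0; e1 + e2 <= 1; ++e2)
              {
                  bool covered = false;
                  for (auto const & s : scheme)
                      covered = covered || admits(s, {e1, e2});
                  std::cout << "errors (" << e1 << ',' << e2 << ") covered: "
                            << covered << '\n'; // all three distributions print 1
              }
      }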
  • Anne Busch, Prasad Thomas, Eric Zuchantke, Holger Brendebach, Kerstin Neubert, Josephine Gruetzke, Sascha Al Dahouk, Martin Peters, Helmut Hotzel, Heinrich Neubauer, Herbert Tomaso, “Revisiting Francisella tularensis subsp. holarctica, Causative Agent of Tularemia in Germany With Bioinformatics: New Insights in Genome Structure, DNA Methylation and Comparative Phylogenetic Analysis”, vol. 9, 2018-03-13.
    cite this publication
    @article{fu_mi_publications2374,
abstract = {Francisella (F.) tularensis is a highly virulent, Gram-negative bacterial pathogen and the causative agent of the zoonotic disease tularemia. Here, we generated, analyzed and characterized a high quality circular genome sequence of the F. tularensis subsp. holarctica strain 12T0050 that caused fatal tularemia in a hare. Besides the genomic structure, we focused on the analysis of oriC, which is unique to the Francisella genus and regulates replication inside and outside of hosts, and we present the first report on genomic DNA methylation of a Francisella strain. The high quality genome was used to establish and evaluate a diagnostic whole genome sequencing pipeline. A genotyping strategy for F. tularensis was developed using various bioinformatics tools. Additionally, whole genome sequences of F. tularensis subsp. holarctica isolates collected in Germany in the years 2008--2015 were generated. A phylogenetic analysis allowed us to determine the genetic relatedness of these isolates and confirmed the highly conserved nature of F. tularensis subsp. holarctica.},
     author = {Anne Busch and Prasad Thomas and Eric Zuchantke and Holger Brendebach and Kerstin Neubert and Josephine Gruetzke and Sascha Al Dahouk and Martin Peters and Helmut Hotzel and Heinrich Neubauer and Herbert Tomaso},
     journal = {Frontiers in Microbiology},
     month = {March},
     title = {Revisiting Francisella tularensis subsp. holarctica, Causative Agent of Tularemia in Germany With Bioinformatics: New Insights in Genome Structure, DNA Methylation and Comparative Phylogenetic Analysis},
     url = {http://publications.imp.fu-berlin.de/2374/},
     volume = {9},
     year = {2018}
    }
  • Sven-Leon Kuchenbecker, “Analysis of Antigen Receptor Repertoires Captured by High Throughput Sequencing”, 2018-06-11.
    cite this publication
    @phdthesis{fu_mi_publications2535,
abstract = {In vertebrate species, the main mechanisms of defence against various types of pathogens are divided into the innate and the adaptive immune system. While the former relies on generic mechanisms, for example to detect the presence of bacterial cells, the latter features mechanisms that allow the individual to acquire defenses against specific, potentially novel features of pathogens and to maintain them throughout life. In a simplified sense, the adaptive immune system continuously generates new defenses against all kinds of structures randomly, carefully selecting them not to be reactive against the host's own cells. The underlying generative mechanism is a unique somatic recombination process modifying the genes encoding the proteins responsible for the recognition of such foreign structures, the so-called antigen receptors. With the advances of high throughput DNA sequencing, we have gained the ability to capture the repertoire of different antigen receptor genes that an individual has acquired by selectively sequencing the recombined loci from a cell sample. This enables us to examine and explore the development and behaviour of the adaptive immune system in a new way, with a variety of potential medical applications. The main focus of this thesis is on two computational problems related to immune repertoire sequencing. Firstly, we developed a method to properly annotate the raw sequencing data that is generated in such experiments, taking into account various sources of biases and errors that either generally occur in the context of DNA sequencing or are specific to immune repertoire sequencing experiments. We will describe the algorithmic details of this method and then demonstrate its superiority in comparison with previously published methods on various datasets. Secondly, we developed a machine learning based workflow to interpret this data in the sense that we attempted to classify such recombined genes functionally using a previously trained model. We implemented alternative models within this workflow, which we will first describe formally and then assess their performances on real data in the context of a binary functional feature in T cells, namely whether they have differentiated into cytotoxic or helper T cells.},
     author = {Sven-Leon Kuchenbecker},
     month = {June},
     school = {Freie Universit{\"a}t Berlin},
     title = {Analysis of Antigen Receptor Repertoires Captured by High Throughput Sequencing},
     url = {http://publications.imp.fu-berlin.de/2535/},
     year = {2018}
    }
  • Vitor C Piro, “Reference and taxonomy based methods for classification and abundance estimation of organisms in metagenomic samples”, 2018-11-30.
    cite this publication
    @phdthesis{fu_mi_publications2538,
     abstract = {Metagenomics provides the means to study the vast and still mostly unknown microbial world which comprises at least half of Earth's genetic diversity. Computational metagenomics enables those discoveries via analysis of large amounts of data which are being generated at a fast pace with high-throughput technologies. Reference-based methods are commonly used to study environmental samples based on a set of previously assembled reference sequences which are often linked to a taxonomic classification. Finding the origin of each sequenced fragment and profiling an environmental sample as a whole are the main goals of binning and taxonomic profiling tools, respectively.
    
    In this thesis I present three methods in computational metagenomics. Sets of curated reference sequences jointly with taxonomic classification are employed to characterize community samples. The main goal of those contributions is to improve the state-of-the-art of taxonomic profiling and binning, with fast, sensitive and precise methods.
    
    First I present ganon, a sequence classification tool for metagenomics which works with a very large number of reference sequences. Ganon provides an efficient method to index sequences and to keep those indices updated in a very short time. In addition, ganon performs taxonomic binning with strongly improved precision compared to the currently available methods. For a general profiling of metagenomic samples and abundance estimation I introduce DUDes. Rather than predicting strains in the sample based only on relative abundances, DUDes first identifies possible candidates by comparing the strength of mapped reads in each node of the taxonomic tree in an iterative top-down manner. This technique works in the opposite direction to the lowest common ancestor approach. Lastly, I present MetaMeta, a pipeline to execute metagenome analysis tools and integrate their results. MetaMeta is a method to combine and enhance results from multiple taxonomic binning and profiling tools and at the same time a pipeline to easily execute tools and analyze environmental data. MetaMeta includes database generation, pre-processing, execution, and integration steps, allowing easy installation, visualization and parallelization of state-of-the-art tools. Using the same input data, MetaMeta provides more sensitive and reliable results with the presence of each identified organism being supported by several methods.
    
    Those three projects introduce new methodologies and improved results over similar methods, constituting valuable contributions to characterize communities in a reference and taxonomy-based manner.},
     author = {Vitor C Piro},
     month = {November},
     school = {Freie Universit{\"a}t Berlin},
     title = {Reference and taxonomy based methods for classification and abundance estimation of organisms in metagenomic samples},
     url = {http://publications.imp.fu-berlin.de/2538/},
     year = {2018}
    }
  • Kathrin Trappe, “Computational Methods for Integrative Structural Variant Analysis Across Species Boundaries”, 2018-11-29.
    cite this publication
    @phdthesis{fu_mi_publications2541,
     abstract = {Structural variations (SVs) are a phenomenon with a tremendous impact on all species. SVs are the result of fundamental rearrangement mechanisms but can lead to severe human diseases like cancer. Rearrangement events also provide means that enable bacteria to adapt to environmental pressures, and they can also happen across species boundaries in events called horizontal gene transfer (HGT). The incorporation of foreign genes from a donor into an acceptor genome can be investigated on the genomic level; the activity and protein expression changes, however, are better revealed on the proteomic level. This thesis contributes four computational methods for the detection of complex SVs of various types and sizes including HGT events from genomic next-generation sequencing (NGS) data and proteomic shotgun mass-spectrometry (MS) data. Concerning HGT events, our methods address the questions of what organisms are involved in the transfer, what genes are exactly transferred and to what position, and what the implications on the proteomic level are. First, we present the generic SV detection tool Gustaf. Gustaf improves the size and type resolution compared to previous SV detection methods. A further specific advantage is the characterisation of translocations and dispersed duplications as a combination of simple, delocalised variants that have to be inferred from separate SV calls. With this basis for a more in-depth focus on HGT detection, we developed two mapping-based methods, Daisy and DaisyGPS. Daisy uses Gustaf and further SV detection strategies to precisely identify the transferred region within the donor and its insertion site in the acceptor genome. DaisyGPS uses metagenomic profiling strategies to identify suitable acceptor and donor references. In contrast to previous approaches based on sequence composition patterns or phylogenetic disagreements, our methods provide a detection based on sequence comparison and hence offer novel means of evidence. In the last project, we present a method for HGT detection, called Hortense, that is based on proteomic MS data. Hortense extends a standard database peptide search with a thorough cross-validation to ensure HGT properties, and is the first dedicated proteomics HGT detection method. Results from Hortense can also serve as supporting evidence and functional confirmation for HGT events proposed by our genomic-based methods. Taken together, the three HGT methods provide a full view of the transfer event that was not possible before or with any one of the methods alone.},
     author = {Kathrin Trappe},
     month = {November},
     school = {Freie Universit{\"a}t Berlin},
     title = {Computational Methods for Integrative Structural Variant Analysis Across Species Boundaries},
     url = {http://publications.imp.fu-berlin.de/2541/},
     year = {2018}
    }

2017

  • Enrique Audain, Julian Uszkoreit, Timo Sachsenberg, Julianus Pfeuffer, Xiao Liang, Henning Hermjakob, Aniel Sanchez, Martin Eisenacher, Knut Reinert, David L. Tabb, Oliver Kohlbacher, Yasset Perez-Riverol, “In-depth analysis of protein inference algorithms using multiple search engines and well-defined metrics”, vol. 150, 2017-01-06.
    cite this publication
    @article{fu_mi_publications1939,
     abstract = {In mass spectrometry-based shotgun proteomics, protein identifications are usually the desired result. However, most of the analytical methods are based on the identification of reliable peptides and not the direct identification of intact proteins. Thus, assembling peptides identified from tandem mass spectra into a list of proteins, referred to as protein inference, is a critical step in proteomics research. Currently, different protein inference algorithms and tools are available for the proteomics community. Here, we evaluated five software tools for protein inference (PIA, ProteinProphet, Fido, ProteinLP, MSBayesPro) using three popular database search engines: Mascot, X!Tandem, and MS-GF+. All the algorithms were evaluated using a highly customizable KNIME workflow on four different public datasets with varying complexities (different sample preparation, species and analytical instruments). We defined a set of quality control metrics to evaluate the performance of each combination of search engines, protein inference algorithm, and parameters on each dataset. We show that the results for complex samples vary not only regarding the actual numbers of reported protein groups but also concerning the actual composition of groups. Furthermore, the robustness of reported proteins when using databases of differing complexities is strongly dependent on the applied inference algorithm. Finally, merging the identifications of multiple search engines does not necessarily increase the number of reported proteins, but does increase the number of peptides per protein and thus can generally be recommended.},
     author = {Enrique Audain and Julian Uszkoreit and Timo Sachsenberg and Julianus Pfeuffer and Xiao Liang and Henning Hermjakob and Aniel Sanchez and Martin Eisenacher and Knut Reinert and David L. Tabb and Oliver Kohlbacher and Yasset Perez-Riverol},
     journal = {Journal of Proteomics},
     month = {January},
     pages = {170--182},
     publisher = {Elsevier},
     title = {In-depth analysis of protein inference algorithms using multiple search engines and well-defined metrics},
     url = {http://publications.imp.fu-berlin.de/1939/},
     volume = {150},
     year = {2017}
    }
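    A quick, hedged illustration of the protein inference step this study evaluates: the C++ sketch below implements a toy greedy parsimony (set-cover) inference, repeatedly picking the protein that explains the most still-unexplained peptides. It is not the algorithm of PIA, ProteinProphet, Fido, ProteinLP or MSBayesPro; all protein and peptide names are invented.

     // Toy greedy parsimony protein inference: repeatedly report the protein
     // that explains the most not-yet-covered peptides. Illustrative only.
     #include <iostream>
     #include <map>
     #include <set>
     #include <string>
     #include <vector>

     int main() {
         // Hypothetical evidence: protein -> peptides identified for it.
         std::map<std::string, std::set<std::string>> prot2pep = {
             {"P1", {"pepA", "pepB", "pepC"}},
             {"P2", {"pepB", "pepD"}},
             {"P3", {"pepD"}}};

         std::set<std::string> uncovered;
         for (auto const & kv : prot2pep)
             uncovered.insert(kv.second.begin(), kv.second.end());

         std::vector<std::string> reported;
         while (!uncovered.empty()) {
             std::string best;
             std::size_t bestGain = 0;
             for (auto const & [prot, peps] : prot2pep) {
                 std::size_t gain = 0;
                 for (auto const & pep : peps)
                     gain += uncovered.count(pep);
                 if (gain > bestGain) { bestGain = gain; best = prot; }
             }
             if (bestGain == 0) break;  // remaining peptides unexplainable
             for (auto const & pep : prot2pep[best]) uncovered.erase(pep);
             reported.push_back(best);
         }
         // Prints P1 and P2; P3 is subsumed (its only peptide is shared).
         for (auto const & prot : reported) std::cout << prot << '\n';
     }

    Real inference tools additionally weigh peptide scores and report protein groups rather than single proteins; the greedy cover only conveys the core parsimony idea.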
  • B. Vatansever, A. Muñoz, C. L. Klein, K. Reinert, “Development and optimisation of a generic micro LC-ESI-MS method for the qualitative and quantitative determination of 30-mer toxic gliadin peptides in wheat flour for food analysis”, vol. 409, iss. 4, 2017-02.
    cite this publication
    @article{fu_mi_publications1976,
     abstract = {We sometimes see manufactured bakery products on the market which are labelled as being gluten free. Why is the content of such gluten proteins important for the bakery industry and its products? The gluten proteins represent up to 80 \% of wheat proteins, and they are conventionally subdivided into gliadins and glutenins. Gliadins belong to the proline and glutamine-rich prolamin family. Its role in human gluten intolerance, as a consequence of its harmful effects, is well documented in the scientific literature. The only known therapy so far is a gluten-free diet, and hence, it is important to develop robust and reliable analytical methods to quantitatively assess the presence of the identified peptides causing the so-called coeliac disease. This work describes the development of a new, fast and robust micro ion pair-LC-MS analytical method for the qualitative and quantitative determination of 30-mer toxic gliadin peptides in wheat flour. The use of RapiGest SF as a denaturation reagent prior to the enzymatic digestion was shown to shorten the measuring time. During the optimisation of the enzymatic digestion step, the best recovery of the 30-mer toxic peptide was obtained after 3 h of digestion time. The lower limit of quantification was determined to be 0.25 ng/{\ensuremath{\mu}}L. The method was shown to be linear for the selected concentration range of 0.25--3.0 ng/{\ensuremath{\mu}}L. The uncertainty related to the reproducibility of the measurement procedure, excluding the extraction step, was shown to be 5.0 \% (N = 12). Finally, this method was successfully applied to the quantification of 30-mer toxic peptides from commercial wheat flour with an overall uncertainty under reproducibility conditions of 6.4 \% including the extraction of the gliadin fraction. The results were always expressed as the average of the values from all standard concentrations. Subsequently, the final concentration of the 30-mer toxic peptide in the flour was calculated and expressed in milligrams per gram unit. The calculated concentration of the 30-mer toxic peptide was found to be 1.29 {$\pm$} 0.37 {\ensuremath{\mu}}g/g in flour (N = 25, sy = 545,075, f = 25 - 2 (t = 2.069), P = 95 \%, two-sided).},
     author = {B. Vatansever and A. Mu{\~n}oz and C. L. Klein and K. Reinert},
     journal = {Analytical and Bioanalytical Chemistry},
     month = {February},
     number = {4},
     pages = {989--997},
     publisher = {Springer Berlin Heidelberg},
     title = {Development and optimisation of a generic micro LC-ESI-MS method for the qualitative and quantitative determination of 30-mer toxic gliadin peptides in wheat flour for food analysis},
     url = {http://publications.imp.fu-berlin.de/1976/},
     volume = {409},
     year = {2017}
    }
  • Knut Reinert, Temesgen Hailemariam Dadi, Marcel Ehrhardt, Hannes Hauswedell, Svenja Mehringer, René Rahn, Jongkyu Kim, Christopher Pockrandt, Jörg Winkler, Enrico Siragusa, Gianvito Urgese, David Weese, “The SeqAn C++ template library for efficient sequence analysis: A resource for programmers”, vol. 261, 2017-11-10.
    cite this publication
    @article{fu_mi_publications2103,
     abstract = {Background
    
    The use of novel algorithmic techniques is pivotal to many important problems in life science. For example the sequencing of the human genome (Venter et al., 2001) would not have been possible without advanced assembly algorithms and the development of practical BWT based read mappers have been instrumental for NGS analysis. However, owing to the high speed of technological progress and the urgent need for bioinformatics tools, there was a widening gap between state-of-the-art algorithmic techniques and the actual algorithmic components of tools that are in widespread use. We previously addressed this by introducing the SeqAn library of efficient data types and algorithms in 2008 (D{\"o}ring et al., 2008).
    
    Results
    
    The SeqAn library has matured considerably since its first publication 9 years ago. In this article we review its status as an established resource for programmers in the field of sequence analysis and its contributions to many analysis tools.
    
    Conclusions
    
    We anticipate that SeqAn will continue to be a valuable resource, especially since it started to actively support various hardware acceleration techniques in a systematic manner.
    
    Keywords
    
    NGS analysis; Software libraries; C++; Data structures},
     author = {Knut Reinert and Temesgen Hailemariam Dadi and Marcel Ehrhardt and Hannes Hauswedell and Svenja Mehringer and Ren{\'e} Rahn and Jongkyu Kim and Christopher Pockrandt and J{\"o}rg Winkler and Enrico Siragusa and Gianvito Urgese and David Weese},
     journal = {Journal of Biotechnology},
     keywords = {NGS analysis; Software libraries; C++; Data structures},
     month = {November},
     pages = {157--168},
     publisher = {ELSEVIER},
     title = {The SeqAn C++ template library for efficient sequence analysis: A resource for programmers},
     url = {http://publications.imp.fu-berlin.de/2103/},
     volume = {261},
     year = {2017}
    }
  • Julianus Pfeuffer, Timo Sachsenberg, Oliver Alka, Mathias Walzer, Alexander Fillbrunn, Lars Nilse, Oliver Schilling, Knut Reinert, Oliver Kohlbacher, “OpenMS – A platform for reproducible analysis of mass spectrometry data”, vol. 261, 2017-11-10.
    cite this publication
    @article{fu_mi_publications2116,
     abstract = {Background
    
    In recent years, several mass spectrometry-based omics technologies have emerged to investigate qualitative and quantitative changes within thousands of biologically active components such as proteins, lipids and metabolites. The research enabled through these methods potentially contributes to the diagnosis and pathophysiology of human diseases as well as to the clarification of structures and interactions between biomolecules. Simultaneously, technological advances in the field of mass spectrometry, leading to an ever-increasing amount of data, demand high standards of efficiency, accuracy and reproducibility from analysis software.
    
    Results
    
    This article presents the current state and ongoing developments in OpenMS, a versatile open-source framework aimed at enabling reproducible analyses of high-throughput mass spectrometry data. It provides implementations of frequently occurring processing operations on MS data through a clean application programming interface in C++ and Python. A collection of 185 tools and ready-made workflows for typical MS-based experiments enables convenient analyses for non-developers and facilitates reproducible research without losing flexibility.
    
    Conclusions
    
    OpenMS will continue to increase its ease of use for developers as well as users with improved continuous integration/deployment strategies, regular trainings with updated training materials and multiple sources of support. The active developer community ensures the incorporation of new features to support state-of-the-art research.},
     author = {Julianus Pfeuffer and Timo Sachsenberg and Oliver Alka and Mathias Walzer and Alexander Fillbrunn and Lars Nilse and Oliver Schilling and Knut Reinert and Oliver Kohlbacher},
     journal = {Journal of Biotechnology},
     month = {November},
     pages = {142--148},
     publisher = {ELSEVIER},
     title = {OpenMS -- A platform for reproducible analysis of mass spectrometry data},
     url = {http://publications.imp.fu-berlin.de/2116/},
     volume = {261},
     year = {2017}
    }
  • Johannes T. Roehr, Christoph Dieterich, Knut Reinert, “Flexbar 3.0 – SIMD and multicore parallelization”, vol. 33, iss. 18, 2017-09-15.
    cite this publication
    @article{fu_mi_publications2117,
     abstract = {Motivation: 
    High-throughput sequencing machines can process many samples in a single run. For Illumina systems, sequencing reads are barcoded with an additional DNA tag that is contained in the respective sequencing adapters. The recognition of barcode and adapter sequences is hence commonly needed for the analysis of next-generation sequencing data. Flexbar performs demultiplexing based on barcodes and adapter trimming for such data. The massive amounts of data generated on modern sequencing machines demand that this preprocessing is done as efficiently as possible.
    
    Results: 
    We present Flexbar 3.0, the successor of the popular program Flexbar. It now employs twofold parallelism: multi-threading and, additionally, SIMD vectorization. Both types of parallelism are used to speed up the computation of pairwise sequence alignments, which are used for the detection of barcodes and adapters. Furthermore, new features were included to cover a wide range of applications. We evaluated the performance of Flexbar based on a simulated sequencing dataset. Our program outcompetes other tools in terms of speed and is among the best tools in the presented quality benchmark.
    
    Availability and implementation:
    https://github.com/seqan/flexbar
    
    Contact:
    johannes.roehr@fu-berlin.de or knut.reinert@fu-berlin.de},
     author = {Johannes T. Roehr and Christoph Dieterich and Knut Reinert},
     journal = {Bioinformatics},
     month = {September},
     number = {18},
     pages = {2941--2942},
     title = {Flexbar 3.0 -- SIMD and multicore parallelization},
     url = {http://publications.imp.fu-berlin.de/2117/},
     volume = {33},
     year = {2017}
    }
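    The Flexbar 3.0 abstract describes twofold parallelism: threads process independent read batches while the per-read alignment kernel is SIMD-vectorized. Below is a minimal C++ sketch of the thread-level half only, assuming a trivial exact substring search as a stand-in for the vectorized pairwise alignment; the reads, adapter and chunking scheme are invented for illustration.

     // Sketch of batch-level parallelism: each thread owns an interleaved
     // slice of the reads, so no synchronization is needed. findAdapterStart()
     // is a placeholder for the alignment kernel Flexbar SIMD-vectorizes.
     #include <iostream>
     #include <string>
     #include <thread>
     #include <vector>

     static std::size_t findAdapterStart(std::string const & read,
                                         std::string const & adapter) {
         std::size_t pos = read.find(adapter);  // stand-in for an alignment
         return pos == std::string::npos ? read.size() : pos;
     }

     int main() {
         std::string const adapter = "AGATCG";
         std::vector<std::string> reads(8, "ACGTACGTAGATCGTTTT");  // toy input

         unsigned const nThreads = 4;
         std::vector<std::thread> workers;
         for (unsigned t = 0; t < nThreads; ++t)
             workers.emplace_back([&, t]() {
                 for (std::size_t i = t; i < reads.size(); i += nThreads)
                     reads[i].resize(findAdapterStart(reads[i], adapter));  // trim 3' end
             });
         for (auto & w : workers) w.join();

         for (auto const & r : reads) std::cout << r << '\n';  // prints ACGTACGT
     }

    Static interleaved chunking keeps the threads free of locks because each read index is owned by exactly one thread.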
  • Christopher Pockrandt, Marcel Ehrhardt, Knut Reinert, S. Sahinalp, “EPR-Dictionaries: A Practical and Fast Data Structure for Constant Time Searches in Unidirectional and Bidirectional FM Indices”, vol. 10229, p. 16, 2017-04-12.
    cite this publication
    @incollection{fu_mi_publications2118,
     abstract = {The unidirectional FM index was introduced by Ferragina and Manzini in 2000 and allows searching for a pattern in the index in one direction. The bidirectional FM index (2FM) was introduced by Lam et al. in 2009. It allows searching for a pattern by extending an infix of the pattern arbitrarily to the left or right. If {\ensuremath{\sigma}} is the size of the alphabet, then the method of Lam et al. can conduct one step in time O({\ensuremath{\sigma}}) while needing space O({\ensuremath{\sigma}}{$\cdot$}n) using constant time rank queries on bit vectors. Schnattinger and colleagues improved this time to O(log{\ensuremath{\sigma}}) while using O(log{\ensuremath{\sigma}}{$\cdot$}n) bits of space for both the FM and the 2FM index. This is achieved by the use of binary wavelet trees.
    
    In this paper we introduce a new, practical method for conducting an exact search in a uni- and bidirectional FM index in O(1) time per step while using O(log{\ensuremath{\sigma}}{$\cdot$}n)+o(log{\ensuremath{\sigma}}{$\cdot$}{\ensuremath{\sigma}}{$\cdot$}n) bits of space. This is done by replacing the binary wavelet tree by a new data structure, the Enhanced Prefixsum Rank dictionary (EPR-dictionary).
    
    We implemented this method in the SeqAn C++ library and experimentally validated our theoretical results. In addition we compared our implementation with other freely available implementations of bidirectional indices and show that we are between {$\approx$}2.2--4.2 times faster. This will have a large impact on many bioinformatics applications that rely on practical implementations of (2)FM indices, e.g. for read mapping. To our knowledge this is the first implementation of a constant time method for a search step in 2FM indices.},
     author = {Christopher Pockrandt and Marcel Ehrhardt and Knut Reinert},
     booktitle = {Research in Computational Molecular Biology. RECOMB 2017},
     editor = {S. Sahinalp},
     month = {April},
     pages = {190--206},
     publisher = {Springer, Cham},
     series = {Lecture Notes in Computer Science (LNCS)},
     title = {EPR-Dictionaries: A Practical and Fast Data Structure for Constant Time Searches in Unidirectional and Bidirectional FM Indices},
     url = {http://publications.imp.fu-berlin.de/2118/},
     volume = {10229},
     year = {2017}
    }
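    The constant-time search step above rests on rank queries. As a hedged sketch of that primitive (the classic bit-vector rank that wavelet trees and, in generalized form, EPR-dictionaries build on; not the EPR data structure itself), here is a self-contained C++20 example using per-word prefix popcounts:

     // Constant-time rank on a bit vector, the primitive underlying FM-index
     // occurrence counting: each 64-bit word carries a precomputed prefix
     // popcount, so rank1(i) is one lookup plus one masked popcount.
     #include <bit>       // std::popcount (C++20)
     #include <cstdint>
     #include <iostream>
     #include <vector>

     struct RankBitVector {
         std::vector<uint64_t> words;
         std::vector<uint64_t> prefix;  // 1-bits in all words before word i

         explicit RankBitVector(std::vector<uint64_t> w)
             : words(std::move(w)), prefix(words.size() + 1, 0) {
             for (std::size_t i = 0; i < words.size(); ++i)
                 prefix[i + 1] = prefix[i] + std::popcount(words[i]);
         }
         // Number of 1-bits in positions [0, i).
         uint64_t rank1(uint64_t i) const {
             uint64_t word = i / 64, bit = i % 64;
             uint64_t inWord =
                 bit ? std::popcount(words[word] & ((uint64_t{1} << bit) - 1)) : 0;
             return prefix[word] + inWord;
         }
     };

     int main() {
         RankBitVector bv({0b1011ULL});    // bits 0, 1 and 3 set
         std::cout << bv.rank1(4) << '\n'; // prints 3
     }

    rank1(i) costs one table lookup plus one popcount regardless of the text length, which is the O(1)-per-step behavior the paper requires; the EPR-dictionary achieves the analogous bound for alphabets larger than {0,1}.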
  • Temesgen Hailemariam Dadi, Bernhard Y. Renard, Lothar H. Wieler, Torsten Semmler, Knut Reinert, “SLIMM: species level identification of microorganisms from metagenomes”, vol. 5, 2017-03-28.
    cite this publication
    @article{fu_mi_publications2119,
     abstract = {Identification and quantification of microorganisms is a significant step in studying the alpha and beta diversities within and between microbial communities, respectively. Both identification and quantification of a given microbial community can be carried out using whole genome shotgun sequences with less bias than when using 16S-rDNA sequences. However, shared regions of DNA among reference genomes and taxonomic units pose a significant challenge in assigning reads correctly to their true origins. The existing microbial community profiling tools commonly deal with this problem by either preparing signature-based unique references or assigning an ambiguous read to its least common ancestor in a taxonomic tree. The former method is limited to making use of the reads which can be mapped to the curated regions, while the latter suffers from the lack of uniquely mapped reads at lower (more specific) taxonomic ranks. Moreover, even if the tools exhibited good performance in calling the organisms present in a sample, there is still room for improvement in determining the correct relative abundance of the organisms. We present a new method, Species Level Identification of Microorganisms from Metagenomes (SLIMM), which addresses the above issues by using coverage information of reference genomes to remove unlikely genomes from the analysis and subsequently gain more uniquely mapped reads to assign at lower ranks of a taxonomic tree. SLIMM is based on a few seemingly easy steps which, when combined, create a tool that outperforms state-of-the-art tools in run-time and memory usage while being on par or better in computing quantitative and qualitative information at species level.},
     author = {Temesgen Hailemariam Dadi and Bernhard Y. Renard and Lothar H. Wieler and Torsten Semmler and Knut Reinert},
     journal = {PeerJ},
     month = {March},
     pages = {e3138},
     title = {SLIMM: species level identification of microorganisms from metagenomes},
     url = {http://publications.imp.fu-berlin.de/2119/},
     volume = {5},
     year = {2017}
    }
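    SLIMM's key idea per the abstract is to use coverage information of reference genomes to discard unlikely genomes before re-assigning ambiguous reads. The hedged C++ toy below estimates breadth of coverage over fixed-size bins; the bin size, cutoff and mapped positions are invented and do not reflect SLIMM's actual parameters.

     // Breadth-of-coverage filter: genomes whose bins are hit too sparsely
     // are likely only attracting reads from shared regions and are dropped.
     #include <iostream>
     #include <map>
     #include <set>
     #include <string>
     #include <vector>

     int main() {
         std::size_t const binSize = 1000;
         double const minBreadth = 0.5;  // hypothetical cutoff

         // genome name -> (genome length, mapped read start positions)
         std::map<std::string, std::pair<std::size_t, std::vector<std::size_t>>> hits = {
             {"genomeA", {10000, {120, 1500, 2300, 4800, 5100, 7700, 9100, 9900}}},
             {"genomeB", {10000, {120, 130}}}};  // hit only via a shared region

         for (auto const & [name, data] : hits) {
             auto const & [len, positions] = data;
             std::size_t nBins = (len + binSize - 1) / binSize;
             std::set<std::size_t> covered;
             for (std::size_t pos : positions) covered.insert(pos / binSize);
             double breadth = double(covered.size()) / double(nBins);
             std::cout << name << " breadth=" << breadth
                       << (breadth >= minBreadth ? " keep" : " drop") << '\n';
         }
     }

    Here genomeA (breadth 0.7) survives while genomeB (breadth 0.1) is removed, after which its reads could be re-assigned uniquely to genomeA.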
  • Russell Schwartz, Knut Reinert, “17th International Workshop on Algorithms in Bioinformatics (WABI 2017)”, vol. 88, p. 394, 2017-08.
    cite this publication
    @book{fu_mi_publications2132,
     abstract = {This proceedings volume contains papers presented at the 17th Workshop on Algorithms in Bioinformatics (WABI 2017), which was held in Boston, MA, USA in conjunction with the 8th ACM Conference on Bioinformatics, Computational Biology, and Health Informatics (ACM BCB) from August 21--23, 2017.
    The Workshop on Algorithms in Bioinformatics is an annual conference established in 2001 to cover all aspects of algorithmic work in bioinformatics, computational biology, and systems biology. The workshop is intended as a forum for discrete algorithms and machine-learning methods that address important problems in molecular biology, that are founded on sound models, that are computationally efficient, and that have been implemented and tested in simulations and on real datasets. The meeting's focus is on recent research results, including significant work in progress, as well as on identifying and exploring directions for future research.
    WABI 2017 is grateful for the support of ACM-BCB in allowing us to cohost the meetings, as well as to ACM-BCB's sponsors: the Association for Computing Machinery (ACM) and ACM's SIGBIO.
    In 2017, a total of 55 manuscripts were submitted to WABI, from which 27 were selected for presentation at the conference. This year, WABI is adopting a new proceedings form, publishing the conference proceedings through the LIPIcs (Leibniz International Proceedings in Informatics) proceedings series. Extended versions of selected papers will be invited for publication in a thematic series in the journal Algorithms for Molecular Biology (AMB), published by BioMed Central.
    The 27 papers were selected based on a thorough peer review, involving at least three independent reviewers per submitted paper, followed by discussions among the WABI Program Committee members. The selected papers cover a wide range of topics, including statistical inference, phylogenetic studies, sequence and genome analysis, comparative genomics, and mass spectrometry data analysis.
    We thank all the authors of submitted papers and the members of the WABI Program Committee and their reviewers for the efforts that made this conference possible. We are also grateful to the WABI Steering Committee for their help and advice. We also thank all the conference participants and speakers who contributed to a great scientific program. In particular, we are indebted to the keynote speaker of the conference, Tandy Warnow, for her presentation. We also thank Christopher Pockrandt for setting up the WABI webpage and Umit Acar for his help with coordinating the WABI and ACM-BCB pages. Finally, we thank the ACM-BCB Organizing Committee, especially Nurit Haspel and Lenore Cowen, for their hard work in making all of the local arrangements and working closely with us to ensure a successful and exciting WABI and ACM-BCB.},
     address = {Saarbr{\"u}cken/Wadern},
     editor = {Russell Schwartz and Knut Reinert},
     month = {August},
     publisher = {Dagstuhl LIPIcs},
     series = {LIPICS},
     title = {17th International Workshop on Algorithms in Bioinformatics (WABI 2017)},
     url = {http://publications.imp.fu-berlin.de/2132/},
     volume = {88},
     year = {2017}
    }
  • Jongkyu Kim, Knut Reinert, “Vaquita: Fast and Accurate Identification of Structural Variation Using Combined Evidence”, iss. 88, p. 14, 2017-08.
    cite this publication
    @incollection{fu_mi_publications2133,
     abstract = {Motivation: 
    Comprehensive identification of structural variations (SVs) is a crucial task for studying genetic diversity and diseases. However, it remains challenging. There is only a marginal consensus between different methods, and our understanding of SVs is substantially limited. In general, integration of multiple pieces of evidence including split-read, read-pair, soft-clip, and read-depth yields the best result regarding accuracy. However, doing this step by step is usually cumbersome and computationally expensive.
    Result: 
    We present Vaquita, an accurate and fast tool for the identification of structural variations, which leverages all four types of evidence in a single program. After merging SVs from split-reads and discordant read-pairs, Vaquita realigns the soft-clipped reads to the selected regions using a fast bit-vector algorithm. Furthermore, it also considers the discrepancy of depth distribution around breakpoints using Kullback-Leibler divergence. Finally, Vaquita provides an additional metric for candidate selection based on voting, as well as robust prioritization based on rank aggregation. We show that Vaquita is robust in terms of sequencing coverage, insert size of the library, and read length, and is comparable to or even better than state-of-the-art tools for the identification of deletions, inversions, duplications, and translocations, using both simulated and real datasets. In addition, Vaquita is more than eight times faster than any other tool in the comparison.
    Availability: 
    Vaquita is implemented in C++ using the SeqAn library. The source code is distributed under the BSD license and can be downloaded at http://github.com/seqan/vaquita},
     address = {Saarbr{\"u}cken/Wadern},
     author = {Jongkyu Kim and Knut Reinert},
     booktitle = {17th International Workshop on Algorithms in Bioinformatics (WABI 2017)},
     editor = {Russell Schwartz and Knut Reinert},
     month = {August},
     number = {88},
     pages = {185(13:1)--198(13:14)},
     publisher = {Dagstuhl LIPIcs},
     series = {LIPICS},
     title = {Vaquita: Fast and Accurate Identification of Structural Variation Using Combined Evidence},
     url = {http://publications.imp.fu-berlin.de/2133/},
     year = {2017}
    }
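    One of Vaquita's four evidence types is the discrepancy of depth distributions around breakpoints, measured with Kullback-Leibler divergence. The following C++ sketch computes a pseudocount-smoothed discrete KL divergence between two toy depth histograms; the smoothing constant and histograms are assumptions for illustration, not Vaquita's actual implementation.

     // KL divergence between depth histograms left/right of a breakpoint.
     // A large value flags a depth discontinuity (e.g. a deletion boundary).
     #include <cmath>
     #include <iostream>
     #include <vector>

     double klDivergence(std::vector<double> p, std::vector<double> q,
                         double eps = 1e-6) {
         double sp = 0, sq = 0;
         for (std::size_t i = 0; i < p.size(); ++i) {
             p[i] += eps; q[i] += eps;   // pseudocounts avoid log(0)
             sp += p[i]; sq += q[i];
         }
         double kl = 0;
         for (std::size_t i = 0; i < p.size(); ++i) {
             double pi = p[i] / sp, qi = q[i] / sq;
             kl += pi * std::log(pi / qi);
         }
         return kl;
     }

     int main() {
         // Toy counts per depth bin on either side of a candidate breakpoint.
         std::vector<double> left  = {1, 4, 20, 30, 20, 5};
         std::vector<double> right = {25, 30, 10, 3, 1, 1};  // deletion-like drop
         std::cout << "KL = " << klDivergence(left, right) << '\n';
     }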
  • Gene Myers, Mihai Pop, Knut Reinert, Tandy Warnow, “Dagstuhl Reports, Vol. 6, No. 8, pp. 91-130: Next Generation Sequencing (Dagstuhl Seminar 16351)”, p. 40, 2017.
    cite this publication
    @manual{fu_mi_publications2134,
     abstract = {Next Generation Sequencing (NGS) data have begun to appear in many applications that are clinically relevant, such as resequencing of cancer patients, disease-gene discovery and diagnostics for rare diseases, microbiome analyses, and gene expression profiling. The analysis of sequencing data is demanding because of the enormous data volume and the need for fast turnaround time, accuracy, reproducibility, and data security. This Dagstuhl Seminar aimed at a free and deep exchange of ideas and needs between the communities of algorithmicists and theoreticians and practitioners from the biomedical field. It identified several relevant fields, such as data structures and algorithms for large data sets, hardware acceleration, and new problems in the upcoming age of genomes, which were discussed in breakout groups.},
     address = {Dagstuhl, Germany},
     author = {Gene Myers and Mihai Pop and Knut Reinert and Tandy Warnow},
     number = {DOI: 10.4230/DagRep.6.8.91},
     publisher = {Schloss Dagstuhl--Leibniz-Zentrum fuer Informatik},
     title = {Dagstuhl Reports, Vol. 6, No. 8, pp. 91-130: Next Generation Sequencing (Dagstuhl Seminar 16351)},
     type = {Documentation},
     url = {http://publications.imp.fu-berlin.de/2134/},
     year = {2017}
    }

2016

  • S. Canzar, S. Andreotti, D. Weese, K. Reinert, G. W. Klau, “CIDANE: comprehensive isoform discovery and abundance estimation”, vol. 17, iss. 1, 2016-01-30.
    cite this publication
    @article{fu_mi_publications1830,
     abstract = {We present CIDANE, a novel framework for genome-based transcript reconstruction and quantification from RNA-seq reads. CIDANE assembles transcripts efficiently with significantly higher sensitivity and precision than existing tools. Its algorithmic core not only reconstructs transcripts ab initio, but also allows the use of the growing annotation of known splice sites, transcription start and end sites, or full-length transcripts, which are available for most model organisms. CIDANE supports the integrated analysis of RNA-seq and additional gene-boundary data and recovers splice junctions that are invisible to other methods. CIDANE is available at http://ccb.jhu.edu/software/cidane/.},
     author = {S. Canzar and S. Andreotti and D. Weese and K. Reinert and G. W. Klau},
     journal = {Genome Biology},
     month = {January},
     number = {1},
     publisher = {BioMed Central, Springer Science+Business Media},
     title = {CIDANE: comprehensive isoform discovery and abundance estimation},
     url = {http://publications.imp.fu-berlin.de/1830/},
     volume = {17},
     year = {2016}
    }
  • Marten Jäger, Max Schubach, Tomasz Zemojtel, Knut Reinert, Deanna M. Church, Peter N. Robinson, “Alternate-locus aware variant calling in whole genome sequencing”, vol. 8, iss. 1, 2016-12-13.
    cite this publication
    @article{fu_mi_publications2004,
     abstract = {
    Background
    
    The last two human genome assemblies have extended the previous linear golden-path paradigm of the human genome to a graph-like model to better represent regions with a high degree of structural variability. The new model offers opportunities to improve the technical validity of variant calling in whole-genome sequencing (WGS).
    Methods
    
    We developed an algorithm that analyzes the patterns of variant calls in the 178 structurally variable regions of the GRCh38 genome assembly, and infers whether a given sample is most likely to contain sequences from the primary assembly, an alternate locus, or their heterozygous combination at each of these 178 regions. We investigate 121 in-house WGS datasets that have been aligned to the GRCh37 and GRCh38 assemblies.
    
    Results
    
    We show that stretches of sequences that are largely but not entirely identical between the primary assembly and an alternate locus can result in multiple variant calls against regions of the primary assembly. In WGS analysis, this results in characteristic and recognizable patterns of variant calls at positions that we term alignable scaffold-discrepant positions (ASDPs). In 121 in-house genomes, on average 51.8{$\pm$}3.8 of the 178 regions were found to correspond best to an alternate locus rather than the primary assembly sequence, and filtering these genomes with our algorithm led to the identification of 7863 variant calls per genome that colocalized with ASDPs. Additionally, we found that 437 of 791 genome-wide association study hits located within one of the regions corresponded to ASDPs.
    
    Conclusions
    
    Our algorithm uses the information contained in the 178 structurally variable regions of the GRCh38 genome assembly to avoid spurious variant calls in cases where samples contain an alternate locus rather than the corresponding segment of the primary assembly. These results suggest the great potential of fully incorporating the resources of graph-like genome assemblies into variant calling, but also underscore the importance of developing computational resources that will allow a full reconstruction of the genotype in personal genomes. Our algorithm is freely available at https://github.com/charite/asdpex.},
     author = {Marten J{\"a}ger and Max Schubach and Tomasz Zemojtel and Knut Reinert and Deanna M. Church and Peter N. Robinson},
     journal = {Genome Medicine},
     month = {December},
     number = {1},
     publisher = {BioMed Central (Springer Nature)},
     title = {Alternate-locus aware variant calling in whole genome sequencing},
     url = {http://publications.imp.fu-berlin.de/2004/},
     volume = {8},
     year = {2016}
    }
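    The region-level decision described above (primary assembly vs. alternate locus vs. heterozygous combination) can be caricatured as a vote over the variant calls at a region's ASDPs. The C++ toy below classifies one region from the fraction of calls matching the alternate-locus allele; the thresholds and call pattern are invented, and the published algorithm is substantially more involved.

     // Toy ASDP vote for one structurally variable region.
     #include <iostream>
     #include <string>
     #include <vector>

     int main() {
         // Calls at the region's ASDPs: true = call matches the
         // alternate-locus allele, false = matches the primary assembly.
         std::vector<bool> asdpCalls = {true, true, false, true,
                                        true, true, false, true};

         std::size_t alt = 0;
         for (bool b : asdpCalls) alt += b;
         double frac = double(alt) / double(asdpCalls.size());

         // Hypothetical cutoffs; real genotype inference is probabilistic.
         std::string verdict = frac > 0.8 ? "alternate locus"
                             : frac < 0.2 ? "primary assembly"
                                          : "heterozygous combination";
         std::cout << "alt-allele fraction=" << frac << " -> " << verdict << '\n';
     }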
  • Martin Zühlke, Daniel Riebe, Toralf Beitz, Hans-Gerd Löhmannsröben, Sandro Andreotti, Knut Reinert, Karl Zenichowski, Marc Diener, “High-performance liquid chromatography with electrospray ionization ion mobility spectrometry: Characterization, data management, and applications”, vol. 39, iss. 24, 2016-11-28.
    cite this publication
    @article{fu_mi_publications2128,
     abstract = {The combination of high-performance liquid chromatography and electrospray ionization ion mobility spectrometry facilitates the two-dimensional separation of complex mixtures in the retention and drift time plane. The ion mobility spectrometer presented here was optimized for flow rates customarily used in high-performance liquid chromatography between 100 and 1500 {\ensuremath{\mu}}L/min. The characterization of the system with respect to such parameters as the peak capacity of each time dimension and of the 2D spectrum was carried out based on a separation of a pesticide mixture containing 24 substances. While the total ion current chromatogram is coarsely resolved, exhibiting coelutions for a number of compounds, all substances can be separately detected in the 2D plane due to the orthogonality of the separations in retention and drift dimensions. Another major advantage of the ion mobility detector is the identification of substances based on their characteristic mobilities. Electrospray ionization allows the detection of substances lacking a chromophore. As an example, the separation of a mixture of 18 amino acids is presented. A software built upon the free mass spectrometry package OpenMS was developed for processing the extensive 2D data. The different processing steps are implemented as separate modules which can be arranged in a graphic workflow facilitating automated processing of data.},
     author = {Martin Z{\"u}hlke and Daniel Riebe and Toralf Beitz and Hans-Gerd L{\"o}hmannsr{\"o}ben and Sandro Andreotti and Knut Reinert and Karl Zenichowski and Marc Diener},
     journal = {Journal of Separation Science},
     month = {November},
     number = {24},
     pages = {4756--4764},
     publisher = {Wiley},
     title = {High-performance liquid chromatography with electrospray ionization ion mobility spectrometry: Characterization, data management, and applications},
     url = {http://publications.imp.fu-berlin.de/2128/},
     volume = {39},
     year = {2016}
    }
  • Hannes L Röst, Timo Sachsenberg, Stephan Aiche, Chris Bielow, Hendrik Weisser, Fabian Aicheler, Sandro Andreotti, Hans-Christian Ehrlich, Petra Gutenbrunner, Erhan Kenar, Xiao Liang, Sven Nahnsen, Lars Nilse, Julianus Pfeuffer, George Rosenberger, Marc Rurik, Uwe Schmitt, Johannes Veit, Mathias Walzer, David Wojnar, Witold E Wolski, Oliver Schilling, Jyoti S Choudhary, Lars Malmström, Ruedi Aebersold, Knut Reinert, Oliver Kohlbacher, “OpenMS: a flexible open-source software platform for mass spectrometry data analysis”, vol. 13, iss. 9, 2016-08-30.
    cite this publication
    @article{fu_mi_publications2129,
     abstract = {High-resolution mass spectrometry (MS) has become an important tool in the life sciences, contributing to the diagnosis and understanding of human diseases, elucidating biomolecular structural information and characterizing cellular signaling networks. However, the rapid growth in the volume and complexity of MS data makes transparent, accurate and reproducible analysis difficult. We present OpenMS 2.0 (http://www.openms.de), a robust, open-source, cross-platform software specifically designed for the flexible and reproducible analysis of high-throughput MS data. The extensible OpenMS software implements common mass spectrometric data processing tasks through a well-defined application programming interface in C++ and Python and through standardized open data formats. OpenMS additionally provides a set of 185 tools and ready-made workflows for common mass spectrometric data processing tasks, which enable users to perform complex quantitative mass spectrometric analyses with ease.},
     author = {Hannes L R{\"o}st and Timo Sachsenberg and Stephan Aiche and Chris Bielow and Hendrik Weisser and Fabian Aicheler and Sandro Andreotti and Hans-Christian Ehrlich and Petra Gutenbrunner and Erhan Kenar and Xiao Liang and Sven Nahnsen and Lars Nilse and Julianus Pfeuffer and George Rosenberger and Marc Rurik and Uwe Schmitt and Johannes Veit and Mathias Walzer and David Wojnar and Witold E Wolski and Oliver Schilling and Jyoti S Choudhary and Lars Malmstr{\"o}m and Ruedi Aebersold and Knut Reinert and Oliver Kohlbacher},
     journal = {Nature Methods},
     month = {August},
     number = {9},
     pages = {741--748},
     publisher = {Springer Nature/Macmillan Publishers Limited},
     title = {OpenMS: a flexible open-source software platform for mass spectrometry data analysis},
     url = {http://publications.imp.fu-berlin.de/2129/},
     volume = {13},
     year = {2016}
    }
  • Luis de la Garza, Johannes Veit, Andras Szolek, Marc Röttig, Stephan Aiche, Sandra Gesing, Knut Reinert, Oliver Kohlbacher, “From the desktop to the grid: scalable bioinformatics via workflow conversion”, vol. 17, iss. 1, 2016-03-12.
    cite this publication
    @article{fu_mi_publications2130,
     abstract = {Background
    
    Reproducibility is one of the tenets of the scientific method. Scientific experiments often comprise complex data flows, selection of adequate parameters, and analysis and visualization of intermediate and end results. Breaking down the complexity of such experiments into the joint collaboration of small, repeatable, well-defined tasks, each with well-defined inputs, parameters, and outputs, offers immediate benefits such as identifying bottlenecks and pinpointing sections that could benefit from parallelization. Workflows rest upon the notion of splitting complex work into the joint effort of several manageable tasks.
    
    There are several engines that give users the ability to design and execute workflows. Each engine was created to address certain problems of a specific community, therefore each one has its advantages and shortcomings. Furthermore, not all features of all workflow engines are royalty-free {--} an aspect that could potentially drive away members of the scientific community.
    
    Results
    
    We have developed a set of tools that enables the scientific community to benefit from workflow interoperability. We developed a platform-free structured representation of parameters, inputs, outputs of command-line tools in so-called Common Tool Descriptor documents. We have also overcome the shortcomings and combined the features of two royalty-free workflow engines with a substantial user community: the Konstanz Information Miner, an engine which we see as a formidable workflow editor, and the Grid and User Support Environment, a web-based framework able to interact with several high-performance computing resources. We have thus created a free and highly accessible way to design workflows on a desktop computer and execute them on high-performance computing resources.
    
    Conclusions
    
    Our work will not only reduce time spent on designing scientific workflows, but also make executing workflows on remote high-performance computing resources more accessible to technically inexperienced users. We strongly believe that our efforts not only decrease the turnaround time to obtain scientific results but also have a positive impact on reproducibility, thus elevating the quality of obtained scientific results.},
     author = {Luis de la Garza and Johannes Veit and Andras Szolek and Marc R{\"o}ttig and Stephan Aiche and Sandra Gesing and Knut Reinert and Oliver Kohlbacher},
     journal = {BMC Bioinformatics},
     month = {March},
     number = {1},
     publisher = {Springer Nature},
     title = {From the desktop to the grid: scalable bioinformatics via workflow conversion},
     url = {http://publications.imp.fu-berlin.de/2130/},
     volume = {17},
     year = {2016}
    }
  • Konstantin Okonechnikov, “High-throughput RNA sequencing: a step forward in transcriptome analysis”, 2016-02-25.
    cite this publication
    @phdthesis{fu_mi_publications2529,
     abstract = {The transcriptome plays an important role in the life of a cell. Detailed analysis of the transcriptome enables interpretation of its structure and functionality. High throughput sequencing technology significantly enhanced the understanding of transcriptome activity. The RNA-sequencing process currently provides the most accurate estimation of gene expression levels. Moreover, RNA-seq allows detection of isoform structure and novel RNA types along with transcription process details such as strand-specificity and much more. The first chapter of this thesis describes the history of transcriptome exploration and effective methods of RNA-seq application. Nevertheless, all steps of the RNA-seq process can produce a number of biases that influence the investigation results. Some typical errors appearing during ligation and amplification procedures might be present in any high throughput sequencing experiment, while other biases occur only in cDNA synthesis or are specific for transcriptome activity. Quality control of sequencing data is important to verify and correct the analysis results. The second chapter of this thesis is devoted to the explanation of these issues and introduces a novel tool, Qualimap 2. This instrument computes detailed statistics and presents a number of plots based on RNA-seq alignment and counts data processing. The generated results enable detection of problems that are specific to RNA-seq experiments. Notably, the tool supports analysis of multiple samples in various conditions. Qualimap 2 was thoroughly compared to other available tools and demonstrated superior functionality in multi-sample quality control. Importantly, RNA-seq can be applied in a relatively novel research area: detection of chimeric transcripts and fusion genes occurring due to genomic rearrangement. Since fusions are related to cancer, their discovery is important not only for science, but also enables medical use of RNA-seq. The third chapter is devoted to the current status of this approach and illustrates a novel toolkit called InFusion, which provides a number of novelties in chimera discovery from RNA-seq data such as detection of fusions arising from the combination of a gene and an intronic or intergenic region. Moreover, strand-specificity of expressed fusion transcripts can be detected and reported. InFusion was compared in detail to a number of other existing tools based on simulated and real datasets and demonstrated higher precision and recall. Overall, RNA-sequencing technology continues to advance, and more specialized analysis capabilities are becoming available. New applications of RNA sequencing and future directions of research are discussed in the last chapter.},
     author = {Konstantin Okonechnikov},
     month = {February},
     school = {Freie Universit{\"a}t Berlin},
     title = {High-throughput RNA sequencing: a step forward in transcriptome analysis},
     url = {http://publications.imp.fu-berlin.de/2529/},
     year = {2016}
    }

2015

  • J. Hu, K. Reinert, “LocalAli: An Evolutionary-based Local Alignment Approach to Identify Functionally Conserved Modules in Multiple Networks.”, vol. 30, iss. 1, 2015.
    cite this publication
    @article{fu_mi_publications1457,
     abstract = {MOTIVATION: Sequences and protein interaction data are of significance for understanding the underlying molecular mechanisms of organisms. Local network alignment is one of the key systematic ways for predicting protein functions, identifying functional modules, and understanding phylogeny from these data. Most currently existing tools, however, have limitations mainly concerning scoring scheme, speed and scalability. Therefore, there are growing demands for sophisticated network evolution models and efficient local alignment algorithms.
    RESULTS: We developed LocalAli, a fast and scalable local network alignment tool for the identification of functionally conserved modules in multiple networks. In this algorithm, we first proposed a new framework to reconstruct the evolution history of conserved modules based on a maximum-parsimony evolutionary model. By relying on this model, LocalAli facilitates interpretation of resulting local alignments in terms of conserved modules which have evolved from a common ancestral module through a series of evolutionary events. A meta-heuristic method, simulated annealing, was used to search for the optimal or near-optimal inner nodes (i.e. ancestral modules) of the evolutionary tree. To evaluate the performance and the statistical significance, LocalAli was tested on a total of 26 real datasets and 1040 randomly generated datasets. The results suggest that LocalAli outperforms all existing algorithms in terms of coverage, consistency and scalability, while retaining a high precision in the identification of functionally coherent subnetworks.
    
    AVAILABILITY:The source code and test datasets are freely available for download under the GNU GPL v3 license at https://code.google.com/p/localali/.
    
    CONTACT:jialu.hu@fu-berlin.de or knut.reinert@fu-berlin.de.},
     author = {J. Hu and K. Reinert},
     journal = {Bioinformatics},
     number = {1},
     pages = {363--372},
     publisher = {Oxford University Press},
     title = {LocalAli: An Evolutionary-based Local Alignment Approach to Identify Functionally Conserved Modules in Multiple Networks.},
     url = {http://publications.imp.fu-berlin.de/1457/},
     volume = {30},
     year = {2015}
    }
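    LocalAli searches the inner nodes of the evolutionary tree with simulated annealing. As a hedged illustration of that meta-heuristic only (the state and objective below are toys; nothing here mirrors LocalAli's actual maximum-parsimony module scoring), a generic C++ annealing loop:

     // Generic simulated annealing: accept improvements always, worsenings
     // with Boltzmann probability that shrinks as the temperature cools.
     #include <cmath>
     #include <iostream>
     #include <random>
     #include <vector>

     int main() {
         std::mt19937 rng(42);
         std::uniform_real_distribution<double> uni(0.0, 1.0);

         std::vector<int> state(20, 0);                  // toy state: bit vector
         auto score = [](std::vector<int> const & s) {   // toy objective: #ones
             int c = 0; for (int b : s) c += b; return double(c);
         };

         double T = 1.0, cooling = 0.95, cur = score(state);
         std::uniform_int_distribution<std::size_t> pick(0, state.size() - 1);
         for (int iter = 0; iter < 1000; ++iter, T *= cooling) {
             std::size_t i = pick(rng);
             state[i] ^= 1;                              // propose: flip one bit
             double cand = score(state);
             if (cand < cur && uni(rng) > std::exp((cand - cur) / T))
                 state[i] ^= 1;                          // reject: undo the move
             else
                 cur = cand;                             // accept
         }
         std::cout << "best score reached: " << cur << '\n';
     }

    In LocalAli's setting, the state would be a candidate set of ancestral modules and the score their parsimony under the evolutionary model; the acceptance rule is the same.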
  • Anja Wilmes, Chris Bielow, Christina Ranninger, Patricia Bellwon, Lydia Aschauer, Alice Limonciel, Hubert Chassaigne, Theresa Kristl, Stephan Aiche, Christian G. Huber, Claude Guillou, Philipp Hewitt, Martin O. Leonard, Wolfgang Dekant, Frederic Bois, Paul Jennings, “Mechanism of cisplatin proximal tubule toxicity revealed by integrating transcriptomics, proteomics, metabolomics and biokinetics”, vol. 30, iss. 1, Part A, 2015-12-25.
    cite this publication
    @article{fu_mi_publications1488,
     abstract = {Cisplatin is one of the most widely used chemotherapeutic agents for the treatment of solid tumours. The major dose-limiting factor is nephrotoxicity, in particular in the proximal tubule. Here, we use an integrated omics approach, including transcriptomics, proteomics and metabolomics coupled to biokinetics, to identify cell stress response pathways induced by cisplatin. The human renal proximal tubular cell line RPTEC/TERT1 was treated with sub-cytotoxic concentrations of cisplatin (0.5 and 2 {\ensuremath{\mu}}M) in a daily repeat-dose treatment regime for up to 14 days. Biokinetic analysis showed that cisplatin was taken up from the basolateral compartment, transported to the apical compartment, and accumulated in cells over time. This is in line with basolateral uptake of cisplatin via organic cation transporter 2 and bioactivation via gamma-glutamyl transpeptidase located on the apical side of proximal tubular cells. Cisplatin affected several pathways including p53 signalling, Nrf2 mediated oxidative stress response, mitochondrial processes, mTOR and AMPK signalling. In addition, we identified novel pathways changed by cisplatin, including eIF2 signalling, actin nucleation via the ARP/WASP complex and regulation of cell polarization. In conclusion, using an integrated omics approach together with biokinetics, we have identified both novel and established mechanisms of cisplatin toxicity.},
     author = {Anja Wilmes and Chris Bielow and Christina Ranninger and Patricia Bellwon and Lydia Aschauer and Alice Limonciel and Hubert Chassaigne and Theresa Kristl and Stephan Aiche and Christian G. Huber and Claude Guillou and Philipp Hewitt and Martin O. Leonard and Wolfgang Dekant and Frederic Bois and Paul Jennings},
     journal = {Toxicology in Vitro},
     month = {December},
     number = {1, Part A},
     pages = {117--127},
     publisher = {Elsevier B.V.},
     title = {Mechanism of cisplatin proximal tubule toxicity revealed by integrating transcriptomics, proteomics, metabolomics and biokinetics},
     url = {http://publications.imp.fu-berlin.de/1488/},
     volume = {30},
     year = {2015}
    }
  • S. Aiche, T. Sachsenberg, E. Kenar, M. Walzer, B. Wiswedel, T. Kristl, M. Boyles, A. Duschl, C. G. Huber, M. R. Berthold, K. Reinert, O. Kohlbacher, “Workflows for automated downstream data analysis and visualization in large-scale computational mass spectrometry”, vol. 15, iss. 8, 2015-04.
    cite this publication
    @article{fu_mi_publications1505,
     abstract = {MS-based proteomics and metabolomics are rapidly evolving research fields driven by the development of novel instruments, experimental approaches, and analysis methods. Monolithic analysis tools perform well on single tasks but lack the flexibility to cope with the constantly changing requirements and experimental setups. Workflow systems, which combine small processing tools into complex analysis pipelines, allow custom-tailored and flexible data-processing workflows that can be published or shared with collaborators. In this article, we present the integration of established tools for computational MS from the open-source software framework OpenMS into the workflow engine Konstanz Information Miner (KNIME) for the analysis of large datasets and production of high-quality visualizations. We provide example workflows to demonstrate combined data processing and visualization for three diverse tasks in computational MS: isobaric mass tag based quantitation in complex experimental setups, label-free quantitation and identification of metabolites, and quality control for proteomics experiments.},
     author = {S. Aiche and T. Sachsenberg and E. Kenar and M. Walzer and B. Wiswedel and T. Kristl and M. Boyles and A. Duschl and C. G. Huber and M. R. Berthold and K. Reinert and O. Kohlbacher},
     journal = {PROTEOMICS},
     month = {April},
     number = {8},
     pages = {1443--1447},
     publisher = {Wiley VCH},
     title = {Workflows for automated downstream data analysis and visualization in large-scale computational mass spectrometry},
     url = {http://publications.imp.fu-berlin.de/1505/},
     volume = {15},
     year = {2015}
    }
  • M. Holtgrewe, L. Kuchenbecker, K. Reinert, “Methods for the Detection and Assembly of Novel Sequence in High-Throughput Sequencing Data”, vol. 31, iss. 12, 2015.
    cite this publication
    @article{fu_mi_publications1506,
     abstract = {Motivation: 
    Large insertions of novel sequence are an important type of structural variants. Previous studies used traditional de novo assemblers for assembling non-mapping high-throughput sequencing (HTS) or capillary reads and then tried to anchor them in the reference using paired read information.
    
    Results: 
    We present approaches for detecting insertion breakpoints and targeted assembly of large insertions from HTS paired data: BASIL and ANISE. On near-identity repeats that are hard for assemblers, ANISE employs a repeat resolution step. This results in far better reconstructions than those obtained by the compared methods. On simulated data, we found our insert assembler to be competitive with the de novo assemblers ABYSS and SGA while yielding already anchored inserted sequence as opposed to the unanchored contigs produced by ABYSS/SGA. On real-world data, we detected novel sequence in a human individual and thoroughly validated the assembled sequence. ANISE was found to be superior to the competing tool MindTheGap on both simulated and real-world data.
    
    Availability and implementation: ANISE and BASIL are available for download at http://www.seqan.de/projects/herbarium under a permissive open source license. 
    
    Contact: manuel.holtgrewe@fu-berlin.de or knut.reinert@fu-berlin.de},
     author = {M. Holtgrewe and L. Kuchenbecker and K. Reinert},
     journal = {Bioinformatics},
     number = {12},
     pages = {1904--1912},
     title = {Methods for the Detection and Assembly of Novel Sequence in High-Throughput Sequencing Data},
     url = {http://publications.imp.fu-berlin.de/1506/},
     volume = {31},
     year = {2015}
    }
  • K. Reinert, B. Langmead, D. Weese, D.J. Evers, “Alignment of Next-Generation Sequencing Reads”, vol. 16, iss. 1, 2015-05-04.
    cite this publication
    @article{fu_mi_publications1544,
     abstract = {High-throughput DNA sequencing has considerably changed the possibilities for conducting biomedical research by measuring billions of short DNA or RNA fragments. A central computational problem, and for many applications a first step, consists of determining where the fragments came from
    in the original genome. In this article, we review the main techniques for generating the fragments, the main applications, and the main algorithmic ideas for computing a solution to the read alignment problem. In addition, we describe pitfalls and difficulties connected to determining the correct positions of reads.},
     author = {K. Reinert and B. Langmead and D. Weese and D.J. Evers},
     journal = {Annual Review of Genomics and Human Genetics},
     month = {May},
     number = {1},
     pages = {133--151},
     title = {Alignment of Next-Generation Sequencing Reads},
     url = {http://publications.imp.fu-berlin.de/1544/},
     volume = {16},
     year = {2015}
    }
  • L. Kuchenbecker, M. Nienen, J. Hecht, A. U. Neumann, N. Babel, K. Reinert, P. N. Robinson, “IMSEQ - a fast and error aware approach to immunogenetic sequence analysis”, vol. 31, iss. 18, 2015.
    cite this publication
    @article{fu_mi_publications1551,
     abstract = {Motivation: Recombined T and B cell receptor repertoires are increasingly being studied using next generation sequencing (NGS) in order to interrogate the repertoire composition as well as changes in the distribution of receptor clones under different physiological and disease states. This type of analysis requires efficient and unambiguous clonotype assignment to a large number of NGS read sequences, including the identification of the incorporated V and J gene segments and the CDR3 sequence. Current tools have deficits with respect to performance, accuracy and documentation of their underlying algorithms and usage.
    
    Results: We present IMSEQ, a method to derive clonotype repertoires from next generation sequencing data with sophisticated routines for handling errors stemming from PCR and sequencing artefacts. The application can handle different kinds of input data originating from single- or paired-end sequencing in different configurations and is generic regarding the species and gene of interest. We have carefully evaluated our method with simulated and real world data and show that IMSEQ is superior to other tools with respect to its clonotyping as well as standalone error correction and runtime performance.
    
    Availability: IMSEQ was implemented in C++ using the SeqAn library for efficient sequence analysis. It is freely available under the GPLv2 open source license and can be downloaded at www.imtools.org. },
     author = {L. Kuchenbecker and M. Nienen and J. Hecht and A. U. Neumann and N. Babel and K. Reinert and P. N. Robinson},
     journal = {Bioinformatics},
     number = {18},
     pages = {2963--2971},
     publisher = {Oxford University Press (online advanced access)},
     title = {IMSEQ - a fast and error aware approach to immunogenetic sequence analysis},
     url = {http://publications.imp.fu-berlin.de/1551/},
     volume = {31},
     year = {2015}
    }
  • L. Schultz, M.-G. Zurich, M. Culot, A. da Costa, C. Landry, P. Bellwon, T. Kristl, K. Hörmann, S. Ruzek, S. Aiche, K. Reinert, C. Bielow, F. Gosselet, R. Cecchelli, C. G. Huber, O. H.-U. Schroeder, A. Gramowski-Voss, D. G. Weiss, A. Bal-Price, “Evaluation of drug-induced neurotoxicity based on metabolomics, proteomics and electrical activity measurements in complementary CNS in vitro models”, vol. 30, iss. 1, 2015-12-25.
    cite this publication
    @article{fu_mi_publications1736,
 abstract = {The present study was performed in an attempt to develop an in vitro integrated testing strategy (ITS) to evaluate drug-induced neurotoxicity. A number of endpoints were analyzed using two complementary brain cell culture models and an in vitro blood-brain barrier (BBB) model after single and repeated exposure treatments with selected drugs that covered the major biological, pharmacological and neurotoxicological responses. Furthermore, four drugs (diazepam, cyclosporine A, chlorpromazine and amiodarone) were tested more in depth as representatives of different classes of neurotoxicants, inducing toxicity through different pathways of toxicity.
    
    The developed in vitro BBB model allowed detection of toxic effects at the level of BBB and evaluation of drug transport through the barrier for predicting free brain concentrations of the studied drugs. The measurement of neuronal electrical activity was found to be a sensitive tool to predict the neuroactivity and neurotoxicity of drugs after acute exposure. The histotypic 3D re-aggregating brain cell cultures, containing all brain cell types, were found to be well suited for OMICs analyses after both acute and long term treatment.
    
    The obtained data suggest that an in vitro ITS based on the information obtained from BBB studies and combined with metabolomics, proteomics and neuronal electrical activity measurements performed in stable in vitro neuronal cell culture systems, has high potential to improve current in vitro drug-induced neurotoxicity evaluation.
    
    Abbreviations:
    BBB, blood brain barrier; DMSO, dimethylsulfoxide; EC, endothelial cells; DIV, day in vitro; IPA, Ingenuity Pathway Analysis; ITS, integrated testing strategy; LY, Lucifer Yellow; MEA, micro-electrode array; RH, Ringer HEPES medium; SPSS, Statistical Package for the Social Sciences},
     author = {L. Schultz and M.-G. Zurich and M. Culot and A. da Costa and C. Landry and P. Bellwon and T. Kristl and K. H{\"o}rmann and S. Ruzek and S. Aiche and K. Reinert and C. Bielow and F. Gosselet and R. Cecchelli and C. G. Huber and O. H.-U. Schroeder and A. Gramowski-Voss and D. G. Weiss and A. Bal-Price},
     journal = {Toxicology in Vitro},
     keywords = {In vitro blood brain barrier; MEA; Neuronal network culture; OMICs; Drug development; 3D brain culture},
     month = {December},
     note = {Online publication: May 2015},
     number = {1},
     pages = {138--165},
     publisher = {Elsevier B.V.},
     title = {Evaluation of drug-induced neurotoxicity based on metabolomics, proteomics and electrical activity measurements in complementary CNS in vitro models},
     url = {http://publications.imp.fu-berlin.de/1736/},
     volume = {30},
     year = {2015}
    }
  • Enrico Siragusa, “Approximate string matching for high-throughput sequencing”, p. 127, 2015-07-23.
    cite this publication
    @phdthesis{fu_mi_publications2507,
 abstract = {Over the past years, high-throughput sequencing (HTS) has become an invaluable method of investigation in molecular and medical biology. HTS technologies make it possible to sequence an individual's DNA sample cheaply and rapidly in the form of billions of short DNA reads. The ability to assess the content of a DNA sample at base-level resolution opens the way to a myriad of applications, including individual genotyping and assessment of large structural variations, measurement of gene expression levels and characterization of epigenetic features. Nonetheless, the quantity and quality of data produced by HTS instruments call for computationally efficient and accurate analysis methods. In this thesis, I present novel methods for the mapping of high-throughput sequencing DNA reads, based on state-of-the-art approximate string matching algorithms and data structures. Read mapping is a fundamental step of any HTS data analysis pipeline in resequencing projects, where DNA reads are reassembled by aligning them back to a previously known reference genome. The ingenuity of approximate string matching methods is crucial to designing efficient and accurate read mapping tools. In the first part of this thesis, I cover practical indexing and filtering methods for exact and approximate string matching. I present state-of-the-art algorithms and data structures, give their pseudocode and discuss their implementation. Furthermore, I provide all implementations within SeqAn, the generic C++ template library for sequence analysis, which is freely available under http://www.seqan.de/. Subsequently, I experimentally evaluate all implemented methods, with the aim of guiding the engineering of new sequence alignment software. To the best of my knowledge, this is the first study providing a comprehensive exposition, implementation and evaluation of such methods. In the second part of this thesis, I turn to the engineering and evaluation of read mapping tools. First, I present a novel method to find all mapping locations per read within a user-defined error rate; this method is published in the peer-reviewed journal Nucleic Acids Research and packaged in an open source tool nicknamed Masai. Afterwards, I generalize this method to quickly report all co-optimal or suboptimal mapping locations per read within a user-defined error rate; this method, packaged in a tool called Yara, provides a more practical, yet sound solution to the read mapping problem. Extensive evaluations, both on simulated and real datasets, show that Yara has better speed and accuracy than de facto standard read mapping tools.},
     author = {Enrico Siragusa},
     month = {July},
     school = {Freie Universit{\"a}t Berlin},
     title = {Approximate string matching for high-throughput sequencing},
     url = {http://publications.imp.fu-berlin.de/2507/},
     year = {2015}
    }
  • Jialu Hu, “Algorithms to Identify Functional Orthologs And Functional Modules from High-Throughput Data”, 2015-01-07.
    cite this publication
    @phdthesis{fu_mi_publications2521,
 abstract = {Many studies in the last decade suggest that the biological network topology supplementing the genome is another important source of biological information for understanding the fundamental principles of life processes. A typical approach aiming to gain insights from the network information is network alignment. It provides a promising framework to understand the organization, function and evolution of molecular networks. However, current algorithms encounter bottlenecks in terms of scalability and speed when applied to the analysis of multiple networks. Hence, novel, efficient strategies are needed to cope with the rapidly growing data in this field. In this thesis, we present two new network alignment algorithms, LocalAli and NetCoffee, and their applications in the analysis of biological data. Both algorithms address the problem of multiple network alignment, but they take different directions: local alignment and global alignment. LocalAli is an evolutionary-based local alignment approach that aims to identify functionally conserved modules from multiple biological networks. In this algorithm, a computational framework is first proposed to reconstruct the evolutionary history of functionally conserved modules. NetCoffee is a global alignment approach with the goal of detecting function-oriented ortholog groups from multiple biological networks. The two algorithms have been applied to several real-world datasets. The results show that both LocalAli and NetCoffee provide substantial improvements over current algorithms in terms of several criteria such as scalability, coverage and consistency. All the test datasets, binaries and source code used for this thesis are freely available at https://code.google.com/p/localali/ and https://code.google.com/p/netcoffee/.},
     author = {Jialu Hu},
     month = {January},
     school = {Freie Universit{\"a}t Berlin},
 title = {Algorithms to Identify Functional Orthologs And Functional Modules from High-Throughput Data},
     url = {http://publications.imp.fu-berlin.de/2521/},
     year = {2015}
    }
  • Manuel Holtgrewe, “Engineering Algorithms for Personal Genome Pipelines”, 2015-11-11.
    cite this publication
    @phdthesis{fu_mi_publications2530,
 abstract = {Recent technical advances in high-throughput sequencing technologies and their commercial availability at low costs have paved the way for revolutionary opportunities in the life sciences. One milestone was reaching the \$1000 genome, making it possible to determine the genetic makeup of hundreds of human individuals within a week for less than \$1000 each. This ongoing revolution of the life sciences creates new challenges for the software and algorithms that process these data. In my thesis, I consider a typical software pipeline for determining the genome of a human individual. For the preprocessing pipeline step, I describe a method for error correction and consider the comparison of such methods. For the read mapping step, I provide a formal definition of read mapping and I present a software package implementing a benchmark for read mapping, based on my formal definition. I then describe the implementation, parallelisation, and engineering of a fully sensitive read mapper and evaluate its performance. For the variant calling step, I present a method for the prediction of insertion breakpoints and the assembly of large insertions. Of course, such a pipeline is not limited to the processing of human data but is also applicable to data from other mammals or organisms with smaller and less complex genomes. The presented work is available as an efficient open source C++ implementation, either as parts of the SeqAn library or as programs using SeqAn.},
     author = {Manuel Holtgrewe},
     month = {November},
     school = {Freie Universit{\"a}t Berlin},
     title = {Engineering Algorithms for Personal Genome Pipelines},
     url = {http://publications.imp.fu-berlin.de/2530/},
     year = {2015}
    }
  • Sandro Andreotti, “Linear Programming and Integer Linear Programming in Bioinformatics”, 2015-02-06.
    cite this publication
    @phdthesis{fu_mi_publications2531,
 abstract = {A wide range of important problems related to bioinformatics and computational biology are optimization problems asking for a solution that minimizes or maximizes a certain objective function. Often, these problems are combinatorial optimization problems that can be formulated as integer linear programs. While for some of these problems polynomial time algorithms are known, for many other problems it is unlikely that such algorithms exist. However, much work has been dedicated to developing algorithms that are capable of solving many interesting integer linear programming problems on real-life instances with acceptable memory and running time requirements. These algorithms are implemented in a variety of free or commercial solver software packages. In situations where the performance of general-purpose solvers is insufficient, problem-specific integer linear programming techniques can often be applied that take advantage of knowledge about the particular structure of the integer linear programming formulation to solve the problem in a much more time- or space-efficient way. In this thesis we present our algorithmic approaches to three relevant bioinformatics problems, each involving certain linear programming and integer linear programming techniques. The first problem is the de novo peptide sequencing problem, which consists in identifying a peptide's sequence solely from its tandem mass spectrum without any additional information stored in genome databases or protein databases. This problem can be formulated as a graph theoretical problem asking for the computation of a longest antisymmetric path in a directed acyclic graph. The particular structure of the associated integer linear programming formulation facilitates the application of a technique called Lagrangian relaxation, which yields an algorithm that outperforms state-of-the-art commercial integer linear programming solvers by orders of magnitude (a generic statement of this relaxation follows this entry). The second problem is the isoform inference and abundance estimation problem from RNA-Seq data. This problem consists in predicting a set of expressed RNA isoforms, i.e., full-length RNA transcripts corresponding to alternative splice variants, together with an estimate of their individual expression levels. We apply a linear programming technique called delayed column generation, which allows us to increase the search space without explicitly enumerating the potentially huge set of candidate isoforms. As a consequence, our approach allows for the identification of isoforms that otherwise could not be recovered due to incomplete read coverage. A central component of our delayed column generation algorithm is an integer linear programming formulation. The third problem is the duplication-loss alignment problem, which asks for a labeled alignment of two genome sequences that implies the minimal number of loss and duplication events in the evolutionary history from an unknown nearest common ancestor. In a labeled alignment, every unaligned gene must be labeled either as a loss or as the product of a duplication event. Once an optimal labeled alignment has been computed, a common ancestor genome with minimal implied evolutionary operations can be derived in a straightforward way. In our approach we identified problem-specific cutting planes and developed efficient separation algorithms to obtain a branch-and-cut algorithm that is several orders of magnitude faster than existing approaches based on integer linear programming.},
     author = {Sandro Andreotti},
     month = {February},
     school = {Freie Universit{\"a}t Berlin},
     title = {Linear Programming and Integer Linear Programming in Bioinformatics},
     url = {http://publications.imp.fu-berlin.de/2531/},
     year = {2015}
    }
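     Since the thesis above leans on Lagrangian relaxation, a generic reminder of that construction may help; the notation below is illustrative and not the thesis's specific de novo sequencing formulation.

       % A 0/1 integer linear program in maximization form:
       \max_{x \in \{0,1\}^n} \; c^{\top} x \quad \text{subject to} \quad Ax \le b

       % Lagrangian relaxation: move the hard constraints into the objective
       % with multipliers \lambda \ge 0:
       L(\lambda) \;=\; \max_{x \in \{0,1\}^n} \; c^{\top} x + \lambda^{\top}(b - Ax)

       % Since b - Ax \ge 0 for every feasible x, L(\lambda) upper-bounds the
       % ILP optimum for every \lambda \ge 0; minimizing L over \lambda \ge 0
       % (the Lagrangian dual) yields the tightest such bound.

     The relaxed maximization typically decomposes into independent, easily solved subproblems, which is what makes the technique fast in practice.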
  • Xintian Arthur You, “Tailored Analysis in Studying Transcriptome Landscape”, 2015-12-22.
    cite this publication
    @phdthesis{fu_mi_publications2532,
 abstract = {The knowledge of the transcriptome landscape is crucial in molecular biology, and increasingly important for disease diagnosis and treatment. Broadly speaking, three layers contribute to the importance of the transcriptome landscape. First, the profile of all isoforms of protein-coding genes determines the development path of cells and organisms. Second, the profile of regulatory elements modulates the activity of protein-coding genes. Third, the interplay of protein-coding genes and regulatory elements shapes the dynamic properties of the transcriptome landscape. Identifying the players in the regulatory network is the first step towards reverse-engineering molecular biology. In this thesis, I present four tailored analyses on projects belonging to the first two layers. First, a hybrid assembly pipeline is developed for the identification of a transcriptome independent of genomic sequences. By combining two complementary sequencing technologies in conjunction with efficient cDNA normalization, a high-quality transcriptome can be characterized. It outperforms other assembly tools that focus on one type of input data, and the results are experimentally validated. Second, an analysis framework is developed to characterize full-length transcripts. By tailoring tools for long-read-length sequencing technology, the transcriptome landscape can be examined in greater detail. Moreover, the association of different RNA processing events can be experimentally measured. The application to the fly Dscam gene transcripts resolved the independent splicing hypothesis and calls for re-examination of previous experiments. The application to rat brain greatly enhanced the transcriptome annotation, which is crucial for the neuroscience community that uses rat as a model organism. Third, a de novo microRNA prediction tool is presented. By designing sequencing experiments that capture snapshots of the miRNA biogenesis process, not only can mature and precursor miRNAs be identified, but also information on miRNA processing and modification can be learnt. Proof-of-principle experiments on well-studied organisms like mouse and C. elegans demonstrate the efficacy and application potential of this method. Finally, a customized pipeline is developed for profiling and characterizing circRNAs. By examining potential splicing junctions based on local alignments, circRNAs can be identified from otherwise neglected RNA-Seq data. Tens of thousands of circRNAs are identified and quantified in mouse, rat and fly. Further experiments demonstrate that circRNAs are enriched in brain synapses and participate in brain development and neuronal homeostatic plasticity. In summary, this thesis presents four tailored analyses on different aspects of the transcriptome landscape. The methods can be used in conjunction towards an integrated understanding of molecular biology and medicine.},
     author = {Xintian Arthur You},
     month = {December},
     school = {Freie Universit{\"a}t Berlin},
     title = {Tailored Analysis in Studying Transcriptome Landscape},
     url = {http://publications.imp.fu-berlin.de/2532/},
     year = {2015}
    }
  • Britta Weber, “Reconstruction of Microtubule Centerlines from Electron Tomograms”, 2015-12-21.
    cite this publication
    @phdthesis{fu_mi_publications2544,
 abstract = {The organization of the mitotic spindle, a structure that separates the chromosomes during cell division, is an active research topic in molecular cell biology. It is composed of microtubules, elongated tubular macromolecules with a diameter of 25 nm. The only volumetric imaging technique that is available to a wide community and provides the required resolution to capture details about microtubules is electron tomography. However, the automatic detection of microtubules in electron tomograms is a difficult task due to the low contrast of the data. Furthermore, thick samples have to be cut into 300 nm thin sections before electron tomography can be applied. Software for automatic segmentation and stitching of microtubules is not available, and therefore these tasks have to be performed manually. Unfortunately, manual segmentation is time consuming for large samples, and manual stitching of the tomograms is often infeasible due to the lack of prominent features for registration. Conclusions drawn from electron tomographic data are currently mostly based on either small samples containing few microtubules or single sections of complex structures. Consequently, simple properties, such as the length of microtubules in the spindle or their number, are still unknown for most model organisms. In this thesis, we present methods for 1) the automatic segmentation of microtubule centerlines in electron tomograms, and 2) the automatic stitching of the lines extracted from serial sections. For the centerline segmentation, we use 3D template matching and exploit knowledge about the shape of microtubules and microscopy artifacts to design the templates. For the registration of the lines, we present a way to model the orientation of lines as a mixture of Fisher-Mises distributions, where we estimate the transformation parameters with the expectation-maximization algorithm. The final line matching problem is formulated in terms of a probabilistic graphical model. To find the correct correspondences of line ends, we use belief propagation. We handle the poor convergence properties of this algorithm by detecting ambiguous and conflicting assignments of lines automatically. An expert can then influence the final output of the algorithm by solving conflicts manually. A detailed error analysis on real biological data and an assessment of the reliability of the results are the prerequisites for analyzing the resulting line representations of the microtubules. To this end, the developed workflow for segmenting and stitching microtubule centerlines is evaluated on plastic-embedded samples of C. elegans early embryos and of spindles from X. laevis egg extracts. Our results suggest that the output of the presented algorithms, together with little manual correction, is of sufficient quality to allow a detailed analysis of dense microtubule networks. Finally, we show exemplary results for the centrosome of a C. elegans mitotic spindle.},
     author = {Britta Weber},
     month = {December},
     school = {Freie Universit{\"a}t Berlin},
     title = {Reconstruction of Microtubule Centerlines from Electron Tomograms},
     url = {http://publications.imp.fu-berlin.de/2544/},
     year = {2015}
    }

2014

  • J. Hu, B. Kehr, K. Reinert, “NetCoffee: a fast and accurate global alignment approach to identify functionally conserved proteins in multiple networks”, vol. 30, iss. 4, 2014-02-15.
    cite this publication
    @article{fu_mi_publications1399,
 abstract = {Motivation: Owing to recent advancements in high-throughput technologies, protein-protein interaction networks of more and more species are becoming available in public databases. The question of how to identify functionally conserved proteins across species attracts a lot of attention in computational biology. Network alignments provide a systematic way to solve this problem. However, most existing alignment tools encounter limitations in tackling this problem. Therefore, the demand for faster and more efficient alignment tools is growing.

    Results: We present a fast and accurate algorithm, NetCoffee, which finds a global alignment of multiple protein-protein interaction networks. NetCoffee searches for a global alignment by maximizing a target function using simulated annealing on a set of weighted bipartite graphs that are constructed using a triplet approach similar to T-Coffee. To assess its performance, NetCoffee was applied to four real datasets. Our results suggest that NetCoffee remedies several limitations of previous algorithms, outperforms all existing alignment tools in terms of speed and nevertheless identifies biologically meaningful alignments (a generic annealing sketch follows this entry).
    
    Availability: The source code and data are freely available for download under the GNU GPL v3 license at https://code.google.com/p/netcoffee/. },
     author = {J. Hu and B. Kehr and K. Reinert},
     journal = {Bioinformatics},
     month = {February},
     number = {4},
     pages = {540--548},
     publisher = {Oxford University Press},
     title = {NetCoffee: a fast and accurate global alignment approach to identify functionally conserved proteins in multiple networks},
     url = {http://publications.imp.fu-berlin.de/1399/},
     volume = {30},
     year = {2014}
    }
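     NetCoffee, above, maximizes a target function by simulated annealing. Purely to illustrate that optimization scheme, the C++ sketch below anneals a toy objective over a bit vector; the state, move set and scoring function are hypothetical stand-ins, not NetCoffee's alignment model.

       // Minimal simulated-annealing loop (maximization). The objective is a
       // toy stand-in; only the accept/reject scheme is the point here.
       #include <cmath>
       #include <iostream>
       #include <random>
       #include <vector>

       // Hypothetical objective: rewards set bits at even positions.
       double score(const std::vector<int>& state) {
           double s = 0.0;
           for (std::size_t i = 0; i < state.size(); ++i)
               if (state[i]) s += (i % 2 == 0) ? 1.0 : -0.5;
           return s;
       }

       int main() {
           std::mt19937 rng(42);
           std::uniform_real_distribution<double> coin(0.0, 1.0);
           std::vector<int> state(100, 0);
           std::uniform_int_distribution<std::size_t> pick(0, state.size() - 1);

           double current = score(state);
           for (double temp = 10.0; temp > 1e-3; temp *= 0.995) {
               std::size_t i = pick(rng);      // propose a local move: flip one bit
               state[i] ^= 1;
               double delta = score(state) - current;
               if (delta >= 0 || coin(rng) < std::exp(delta / temp))
                   current += delta;           // accept (always if improving)
               else
                   state[i] ^= 1;              // reject: undo the flip
           }
           std::cout << "final score: " << current << "\n";
       }

     Cooling slowly enough lets the search escape poor local optima early on while it converges late.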
  • S. Wandelt, D. Deng, S. Gerdjikov, S. Mishra, P. Mitankin, M. Patil, E. Siragusa, A. Tiskin, W. Wang, J. Wang, U. Leser, “State-of-the-art in String Similarity Search and Join”, vol. 43, iss. 1, 2014-03.
    cite this publication
    @article{fu_mi_publications1401,
     author = {S. Wandelt and D. Deng and S. Gerdjikov and S. Mishra and P. Mitankin and M. Patil and E. Siragusa and A. Tiskin and W. Wang and J. Wang and U. Leser},
     journal = {SIGMOD Record},
     month = {March},
     number = {1},
     title = {State-of-the-art in String Similarity Search and Join},
     url = {http://publications.imp.fu-berlin.de/1401/},
     volume = {43},
     year = {2014}
    }
  • B. Kehr, K. Trappe, M. Holtgrewe, K. Reinert, “Genome alignment with graph data structures: a comparison”, vol. 15, 2014-04-09.
    cite this publication
    @article{fu_mi_publications1437,
     abstract = {Background
    
    Recent advances in rapid, low-cost sequencing have opened up the opportunity to study complete genome sequences. The computational approach of multiple genome alignment allows investigation of evolutionarily related genomes in an integrated fashion, providing a basis for downstream analyses such as rearrangement studies and phylogenetic inference.
    
    Graphs have proven to be a powerful tool for coping with the complexity of genome-scale sequence alignments. The potential of graphs to intuitively represent all aspects of genome alignments led to the development of graph-based approaches for genome alignment. These approaches construct a graph from a set of local alignments, and derive a genome alignment through identification and removal of graph substructures that indicate errors in the alignment.
    
    Results
    
    We compare the structures of commonly used graphs in terms of their abilities to represent alignment information. We describe how the graphs can be transformed into each other, and identify and classify graph substructures common to one or more graphs. Based on previous approaches, we compile a list of modifications that remove these substructures.
    
    Conclusion
    
    We show that crucial pieces of alignment information, associated with inversions and duplications, are not visible in the structure of all graphs. If we neglect vertex or edge labels, the graphs differ in their information content. Still, many ideas are shared among all graph-based approaches. Based on these findings, we outline a conceptual framework for graph-based genome alignment that can assist in the development of future genome alignment tools. },
     author = {B. Kehr and K. Trappe and M. Holtgrewe and K. Reinert},
     journal = {BMC Bioinformatics},
     month = {April},
     publisher = {BioMed Central},
     title = {Genome alignment with graph data structures: a comparison},
     url = {http://publications.imp.fu-berlin.de/1437/},
     volume = {15},
     year = {2014}
    }
  • R. Rahn, D. Weese, K. Reinert, “Journaled string tree--a scalable data structure for analyzing thousands of similar genomes on your laptop”, 2014-07-15.
    cite this publication
    @article{fu_mi_publications1448,
 abstract = {Motivation: Next-generation sequencing (NGS) has revolutionized biomedical research in the past decade and led to a continuous stream of developments in bioinformatics, addressing the need for fast and space-efficient solutions for analyzing NGS data. Often researchers need to analyze a set of genomic sequences that stem from closely related species or are indeed individuals of the same species. Hence, the analyzed sequences are similar. For analyses where local changes in the examined sequence induce only local changes in the results, it is clearly desirable not to examine identical or similar regions repeatedly.
    
Results: In this work, we provide a datatype that exploits data parallelism inherent in a set of similar sequences by analyzing shared regions only once. In real-world experiments, we show that algorithms that otherwise would scan each reference sequentially can be sped up by a factor of 115 (a toy sketch of the underlying delta representation follows this entry).
    
    Availability: The data structure and associated tools are publicly available at http://www.seqan.de/projects/jst and are part of SeqAn, the C++ template library for sequence analysis.
    
    Contact: rene.rahn@fu-berlin.de},
     author = {R. Rahn and D. Weese and K. Reinert},
     journal = {Bioinformatics},
     month = {July},
     title = {Journaled string tree--a scalable data structure for analyzing thousands of similar genomes on your laptop},
     url = {http://publications.imp.fu-berlin.de/1448/},
     year = {2014}
    }
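     The journaled string tree above rests on storing one reference plus compact per-sequence deltas, so that regions shared by all genomes are kept, and can be traversed, only once. The self-contained C++ sketch below shows that delta representation in miniature; it is a conceptual illustration, not SeqAn's actual JournaledStringTree interface.

       // Delta-compressed set of similar sequences: each genome is stored as
       // substitutions against a shared reference and materialized on demand.
       #include <iostream>
       #include <map>
       #include <string>
       #include <vector>

       struct DeltaSequence {
           std::map<std::size_t, char> substitutions;  // ref position -> new base

           std::string materialize(const std::string& reference) const {
               std::string s = reference;
               for (auto const& [pos, base] : substitutions) s[pos] = base;
               return s;
           }
       };

       int main() {
           const std::string reference = "ACGTACGTACGT";
           std::vector<DeltaSequence> genomes(3);
           genomes[1].substitutions[3] = 'A';  // one SNV in genome 1
           genomes[2].substitutions[7] = 'C';  // one SNV in genome 2

           // Positions untouched by any delta are identical in all genomes;
           // a journaled traversal would examine them only once.
           for (auto const& g : genomes)
               std::cout << g.materialize(reference) << "\n";
       }

     The published data structure goes much further, e.g. supporting online algorithms over all sequences simultaneously, but the space saving already shows in this toy: three genomes cost one reference plus two single-character deltas.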
  • M. Walzer, L. E. Pernas, S. Nasso, W. Bittremieux, S. Nahnsen, P. Kelchtermans, P. Pichler, H. W. P. van den Toorn, A. Staes, J. Vandenbussche, M. Mazanek, T. Taus, R. A. Scheltema, C. D. Kelstrup, L. Gatto, B. van Breukelen, S. Aiche, D. Valkenborg, K. Laukens, K. S. Lilley, J. V. Olsen, A. J. R. Heck, K. Mechtler, R. Aebersold, K. Gevaert, J. A. Vizcaino, H. Hermjakob, O. Kohlbacher, L. Martens, “qcML: An Exchange Format for Quality Control Metrics from Mass Spectrometry Experiments”, vol. 13, iss. 8, 2014-08-01.
    cite this publication
    @article{fu_mi_publications1449,
 abstract = {Quality control is increasingly recognized as a crucial aspect of mass spectrometry based proteomics. Several recent papers discuss relevant parameters for quality control and present applications to extract these from the instrumental raw data. What has been missing, however, is a standard data exchange format for reporting these performance metrics. We therefore developed the qcML format, an XML-based standard that follows the design principles of the related mzML, mzIdentML, mzQuantML, and TraML standards from the HUPO-PSI (Proteomics Standards Initiative). In addition to the XML format, we also provide tools for the calculation of a wide range of quality metrics as well as a database format and interconversion tools, so that existing LIMS systems can easily add relational storage of the quality control data to their existing schema. We here describe the qcML specification, along with possible use cases and an illustrative example of the subsequent analysis possibilities. All information about qcML is available at http://code.google.com/p/qcml.},
     author = {M. Walzer and L. E. Pernas and S. Nasso and W. Bittremieux and S. Nahnsen and P. Kelchtermans and P. Pichler and H. W. P. van den Toorn and A. Staes and J. Vandenbussche and M. Mazanek and T. Taus and R. A. Scheltema and C. D. Kelstrup and L. Gatto and B. van Breukelen and S. Aiche and D. Valkenborg and K. Laukens and K. S. Lilley and J. V. Olsen and A. J. R. Heck and K. Mechtler and R. Aebersold and K. Gevaert and J. A. Vizcaino and H. Hermjakob and O. Kohlbacher and L. Martens},
     journal = {Molecular \& Cellular Proteomics},
     month = {August},
     number = {8},
     pages = {1905--1913},
     title = {qcML: An Exchange Format for Quality Control Metrics from Mass Spectrometry Experiments},
     url = {http://publications.imp.fu-berlin.de/1449/},
     volume = {13},
     year = {2014}
    }
  • M. H. Schulz, D. Weese, M. Holtgrewe, V. Dimitrova, S. Niu, K. Reinert, H. Richard, “Fiona: a parallel and automatic strategy for read error correction”, vol. 30, iss. 17, 2014.
    cite this publication
    @article{fu_mi_publications1451,
 abstract = {Motivation: Automatic error correction of high-throughput sequencing data can have a dramatic impact on the amount of usable base pairs and their quality. It has been shown that the performance of tasks such as de novo genome assembly and SNP calling can be dramatically improved after read error correction. While a large number of methods specialized for correcting substitution errors as found in Illumina data exist, few methods for the correction of indel errors, common to technologies like 454 or Ion Torrent, have been proposed.

    Results: We present Fiona, a new stand-alone read error correction method. Fiona provides a new statistical approach for sequencing error detection and optimal error correction and estimates its parameters automatically. Fiona is able to correct substitution, insertion and deletion errors and can be applied to any sequencing technology. It uses an efficient implementation of the partial suffix array to detect read overlaps with different seed lengths in parallel. We tested Fiona on several real datasets from a variety of organisms with different read lengths and compared its performance with state-of-the-art methods. Fiona shows a consistently higher correction accuracy over a broad range of datasets from 454 and Ion Torrent sequencers, without compromise in speed (a simplified spectrum-based illustration follows this entry).

    Conclusion: Fiona is an accurate parameter-free read error correction method that can be run on inexpensive hardware and can make use of multicore parallelization whenever available. Fiona was implemented using the SeqAn library for sequence analysis and is publicly available for download at http://www.seqan.de/projects/fiona.

    Contact: mschulz@mmci.uni-saarland.de or hugues.richard@upmc.fr

    Supplementary information: Supplementary data are available at Bioinformatics online.},
     author = {M. H. Schulz and D. Weese and M. Holtgrewe and V. Dimitrova and S. Niu and K. Reinert and H. Richard},
     journal = {Bioinformatics},
     number = {17},
     pages = {i356--i363},
     title = {Fiona: a parallel and automatic strategy for read error correction},
     url = {http://publications.imp.fu-berlin.de/1451/},
     volume = {30},
     year = {2014}
    }
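     Fiona itself detects and corrects errors with a partial suffix array and a statistical model, as the abstract above states. A much simpler relative of the same principle is k-mer-spectrum error detection: k-mers seen only rarely across all reads are likely to contain errors. The C++ sketch below illustrates only this simplified idea; the threshold and data are toys, and this is not Fiona's algorithm.

       // Toy k-mer-spectrum error detection: count all k-mers in the read
       // set, then flag k-mers below a solidity threshold as suspect.
       #include <iostream>
       #include <string>
       #include <unordered_map>
       #include <vector>

       int main() {
           const std::size_t k = 4;
           const int solidityThreshold = 2;  // toy value
           std::vector<std::string> reads = {"ACGTACGT", "ACGTACGT", "ACGTTCGT"};

           // Count every k-mer across the whole read set.
           std::unordered_map<std::string, int> counts;
           for (auto const& r : reads)
               for (std::size_t i = 0; i + k <= r.size(); ++i)
                   ++counts[r.substr(i, k)];

           // Rare k-mers point at likely sequencing errors.
           for (auto const& r : reads)
               for (std::size_t i = 0; i + k <= r.size(); ++i)
                   if (counts[r.substr(i, k)] < solidityThreshold)
                       std::cout << "suspect k-mer " << r.substr(i, k)
                                 << " at position " << i << "\n";
       }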
  • H. Hauswedell, J. Singer, K. Reinert, “Lambda: the local aligner for massive biological data”, vol. 30, iss. 17, 2014-09.
    cite this publication
    @article{fu_mi_publications1453,
 abstract = {MOTIVATION: Next-generation sequencing technologies produce unprecedented amounts of data, leading to completely new research fields. One of these is metagenomics, the study of large-size DNA samples containing a multitude of diverse organisms. A key problem in metagenomics is to functionally and taxonomically classify the sequenced DNA, to which end the well-known BLAST program is usually used. But BLAST has dramatic resource requirements at metagenomic scales of data, imposing a high financial or technical burden on the researcher. Multiple attempts have been made to overcome these limitations and present a viable alternative to BLAST.

    RESULTS: In this work we present Lambda, our own alternative to BLAST in the context of sequence classification. In our tests, Lambda often outperforms the best tools at reproducing BLAST's results and is the fastest compared with the current state of the art at comparable levels of sensitivity.

    AVAILABILITY AND IMPLEMENTATION: Lambda was implemented in the SeqAn open-source C++ library for sequence analysis and is publicly available for download at http://www.seqan.de/projects/lambda.

    CONTACT: hannes.hauswedell@fu-berlin.de

    SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.},
     author = {H. Hauswedell and J. Singer and K. Reinert},
 journal = {Bioinformatics},
     month = {September},
     number = {17},
     pages = {i349--i355},
     publisher = {Oxford University Press},
     title = {Lambda: the local aligner for massive biological data},
     url = {http://publications.imp.fu-berlin.de/1453/},
     volume = {30},
     year = {2014}
    }
  • K. Trappe, A.-K. Emde, H.-C. Ehrlich, K. Reinert, “Gustaf: Detecting and correctly classifying SVs in the NGS twilight zone”, vol. 30, iss. 24, 2014-07-14.
    cite this publication
    @article{fu_mi_publications1455,
     abstract = {MOTIVATION:
    The landscape of structural variation (SV) including complex duplication and translocation patterns is far from resolved. SV detection tools usually exhibit low agreement, are often geared toward certain types or size ranges of variation and struggle to correctly classify the type and exact size of SVs.
    
    RESULTS:
We present Gustaf (Generic mUlti-SpliT Alignment Finder), a sound generic multi-split SV detection tool that detects and classifies deletions, inversions, dispersed duplications and translocations of {\ensuremath{\ge}}30 bp. Our approach is based on a generic multi-split alignment strategy that can identify SV breakpoints with base pair resolution. We show that Gustaf correctly identifies SVs, especially in the range from 30 to 100 bp, which we call the next-generation sequencing (NGS) twilight zone of SVs, as well as larger SVs {\ensuremath{>}}500 bp. Gustaf performs better than similar tools in our benchmark and is furthermore able to correctly identify size and location of dispersed duplications and translocations, which otherwise might be wrongly classified, for example, as large deletions (a toy classification sketch follows this entry).

    AVAILABILITY AND IMPLEMENTATION: Project information, paper benchmark and source code are available via http://www.seqan.de/projects/gustaf/.

    CONTACT: kathrin.trappe@fu-berlin.de.},
     author = {K. Trappe and A.-K. Emde and H.-C. Ehrlich and K. Reinert},
     journal = {Bioinformatics},
     month = {July},
     number = {24},
     pages = {3484--3490},
     publisher = {Oxford University Press},
     title = {Gustaf: Detecting and correctly classifying SVs in the NGS twilight zone},
     url = {http://publications.imp.fu-berlin.de/1455/},
     volume = {30},
     year = {2014}
    }
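     To make the classification step described above concrete, the toy C++ sketch below infers an SV type from two split-alignment segments of one read. It deliberately ignores segment lengths, cross-strand bookkeeping and breakpoint ambiguity, all of which a real caller such as Gustaf must handle; every name here is illustrative.

       // Toy SV classification from two split-alignment segments of one read.
       #include <iostream>
       #include <string>

       struct Segment {
           std::string chrom;  // reference sequence the segment aligns to
           long refStart;      // alignment start on the reference
           bool forward;       // alignment orientation
       };

       std::string classify(const Segment& a, const Segment& b) {
           if (a.chrom != b.chrom) return "translocation candidate";
           if (a.forward != b.forward) return "inversion breakpoint";
           long jump = b.refStart - a.refStart;  // ignores segment length
           if (jump > 0) return "deletion of roughly " + std::to_string(jump) + " bp";
           if (jump < 0) return "duplication or backward jump";
           return "colinear (no SV)";
       }

       int main() {
           Segment left{"chr1", 1000, true};
           Segment right{"chr1", 1520, true};  // read resumes 520 bp downstream
           std::cout << classify(left, right) << "\n";
       }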
  • V. Neu, C. Bielow, K. Reinert, C. G. Huber, “Ultrahigh-performance liquid chromatography-ultraviolet absorbance detection-high-resolution-mass spectrometry combined with automated data processing for studying the kinetics of oxidative thermal degradation of thyroxine in the solid state”, vol. 1371, 2014-12.
    cite this publication
    @article{fu_mi_publications1463,
 abstract = {Levothyroxine as active pharmaceutical ingredient of formulations used for the treatment of hypothyroidism is distributed worldwide and taken by millions of people. An important issue in terms of compound stability is its capability to react with ambient oxygen, especially in the case of long-term compound storage at elevated temperature. In this study we demonstrate that ultrahigh-performance liquid chromatography coupled to UV spectrometry and high-resolution mass spectrometry (UHPLC-UV-HRMS) represents a very useful approach to investigate the influence of ambient oxygen on the degradation kinetics of levothyroxine in the solid state at enhanced degradation conditions. Moreover, the impurity pattern of oxidative degradation of levothyroxine is elucidated and classified with respect to degradation kinetics at different oxygen levels. Kinetic analysis of thyroxine bulk material at 100 {\textdegree}C reveals bi-phasic degradation kinetics with a distinct change in degradation phases dependent on the availability of oxygen. The results clearly show that contact of the bulk material with ambient oxygen is a key factor for fast compound degradation. Furthermore, the combination of time-resolved HRMS data and automated data processing is shown to allow insights into the kinetics and mechanism of impurity formation on an individual compound basis. By comparing degradation profiles, four main classes of profiles linked to reaction pathways of thyroxine degradation were identifiable. Finally, we show the capability of automated data processing for the matching of different stressing conditions, in order to extract information about mechanistic similarities. As a result, degradation kinetics is influenced by factors like availability of oxygen, stressing time, or stressing temperature, while the degradation mechanisms appear to be conserved.},
 keywords = {Thyroxine; Ultrahigh-performance liquid chromatography; Electrospray ionization Orbitrap mass spectrometry; Kinetics; Drug degradation; Bioinformatics},
     author = {V. Neu and C. Bielow and K. Reinert and C. G. Huber},
     journal = {Journal of chromatography A},
     month = {December},
     pages = {196--203},
     title = {Ultrahigh-performance liquid chromatography-ultraviolet absorbance detection-high-resolution-mass spectrometry combined with automated data processing for studying the kinetics of oxidative thermal degradation of thyroxine in the solid state},
     url = {http://publications.imp.fu-berlin.de/1463/},
     volume = {1371},
     year = {2014}
    }
  • Birte Kehr, “Contributions to computing and modeling multiple whole-genome alignments”, 2014-06-11.
    cite this publication
    @phdthesis{fu_mi_publications2525,
 abstract = {Recent advances in sequencing technologies have opened up the opportunity to study whole genomes at the nucleotide level. Similarities in the nucleotide sequences of genomes provide new insights into the relationships of organisms and species. Multiple whole-genome alignments represent these similarities; however, their computation is challenging. In contrast to approaches for other sequence alignment problems, genome alignment methods have to deal with very long sequences and with non-colinearity of similarities. This thesis makes three contributions to the development of multiple whole-genome alignment methods. The prevailing strategy of such methods is to combine a set of local alignments into a global genome alignment. This thesis suggests an efficient and fully sensitive local alignment approach, compares graph data structures for representing genome alignments, and describes hidden rearrangement breakpoints that become visible only in the comparison of more than two genomes. All three contributions provide potential for significant improvements to the computation or modeling of genome alignments. In a comparison with other local alignment approaches, the new local aligner is the fastest of three fully sensitive ones and competitive with seed-and-extend approaches despite having full sensitivity. The assessment of graph data structures describes for the first time all graphs using the same terminology, and demonstrates how the graph structures differ in their information content. Finally, an analysis of breakpoints in simulated genome alignments suggests that hidden breakpoints are abundant and relevant for measuring the accuracy of genome alignments. In summary, the three contributions provide a promising basis for future genome alignment methods.},
     author = {Birte Kehr},
     month = {June},
     school = {Freie Universit{\"a}t Berlin},
     title = {Contributions to computing and modeling multiple whole-genome alignments},
     url = {http://publications.imp.fu-berlin.de/2525/},
     year = {2014}
    }
  • Alexandra Zerck, “Optimal precursor ion selection for LC-MS/MS based proteomics”, 2014-02-13.
    cite this publication
    @phdthesis{fu_mi_publications2527,
 abstract = {Shotgun proteomics with Liquid Chromatography (LC) coupled to Tandem Mass Spectrometry (MS/MS) is a key technology for protein identification and quantitation. Protein identification is done indirectly: detected peptide signals are fragmented by MS/MS and their sequence is reconstructed. Afterwards, the identified peptides are used to infer the proteins present in a sample. The problem of choosing the peptide signals that shall be identified with MS/MS is called precursor ion selection. Most workflows use data-dependent acquisition for precursor ion selection despite known drawbacks like data redundancy, limited reproducibility or a bias towards high-abundance proteins. In this thesis, we formulate optimization problems for different aspects of precursor ion selection to overcome these weaknesses. In the first part of this work we develop inclusion lists aiming at optimal precursor ion selection given different input information. We trace precursor ion selection back to known combinatorial problems and develop linear program (LP) formulations. The first method creates an inclusion list given a set of detected features in an LC-MS map. We show that this setting is an instance of the Knapsack Problem. The corresponding LP can be solved efficiently and yields inclusion lists that schedule more precursors than standard methods when the number of precursors per fraction is limited. Furthermore, we develop a method for inclusion list creation based on a list of proteins of interest. We employ retention time and detectability prediction to infer LC-MS features. Based on peptide detectability, we introduce protein detectabilities that reflect the likelihood of detecting and identifying a protein. By maximizing the sum of protein detectabilities we create an inclusion list of limited size that covers a maximum number of proteins. In the second part of the thesis, we focus on iterative precursor ion selection (IPS) with LC-MALDI MS/MS. Here, after a fixed number of acquired MS/MS spectra their identification results are evaluated and used for the next round of precursor ion selection. We develop a heuristic which creates a ranked precursor list. The second method, IPS LP, is a combination of the two inclusion list scenarios presented in the first part. Additionally, a protein-based exclusion is part of the objective function. For evaluation, we compared both IPS methods to a static inclusion list (SPS) created before the beginning of MS/MS acquisition. We simulated precursor ion selection on three data sets of different complexity and show that IPS LP can identify the same number of proteins with fewer selected precursors. This improvement is especially pronounced for low-abundance proteins. Additionally, we show that IPS LP decreases the bias towards high-abundance proteins. All presented algorithms were implemented in OpenMS, a software library for mass spectrometry. Finally, we present an online tool for IPS that has direct access to the instrument and controls the measurement.},
     author = {Alexandra Zerck},
     month = {February},
     school = {Freie Universit{\"a}t Berlin},
     title = {Optimal precursor ion selection for LC-MS/MS based proteomics},
     url = {http://publications.imp.fu-berlin.de/2527/},
     year = {2014}
    }
  • Chris Bauer, “Exploiting Proteomics Data”, 2014-02-20.
    cite this publication
    @phdthesis{fu_mi_publications2537,
 abstract = {Proteomics plays a central role in understanding complex disease mechanisms, especially since it is well known that the effectors of biological functions are mostly proteins. Besides classical gel-based techniques, especially mass spectrometry (MS) has emerged as the standard technique for proteomics experiments. MS-based proteomics has evolved into several different and partly complementary technologies. In this thesis we have analyzed data generated by the three complementary technologies: Matrix-Assisted Laser Desorption/Ionization (MALDI), Isobaric Tags for Relative and Absolute Quantitation (iTRAQ) and 2D Difference Gel Electrophoresis (DIGE). The three technologies are applied to an obesity-induced mouse model in order to gain relevant knowledge on biological processes involved in diabetes. The primary goal of this thesis is to develop and implement specifically tailored data analysis methods for each technology, with the aim to improve the quality and reliability of the results compared to standard evaluation workflows. The developed methods benefit from the fact that in proteomics a single protein is typically represented by several peptides showing more or less similar measurements. Combining this similarity information with advanced statistical testing, we are able to identify sets of potential biomarkers that may play an important role in diabetes disease mechanisms. The identified biomarkers are very well suited for building a classification engine to predict disease relations. However, peptides derived from the same protein may also show contradictory quantitations (e.g. a protein appears two-fold up-regulated and two-fold down-regulated at the same time). This could be due to technical artifacts or biological properties (e.g. protein isoforms). We try to resolve these contradictions with PPINGUIN, a workflow developed for the reliable quantitation of iTRAQ experiments. Application of the developed methods leads to improved results compared to standard data evaluation methods. The three technologies have a complementary character and therefore a direct comparison is difficult and shows only a small overlap. But a comparison on the more abstract level of biochemical pathways shows a surprisingly good agreement of the results. In order to better understand the complex processes involved in diabetes, a major challenge remains in integrating the results with other 'omics' technologies.},
     author = {Chris Bauer},
     month = {February},
     school = {Freie Universit{\"a}t Berlin},
     title = {Exploiting Proteomics Data},
     url = {http://publications.imp.fu-berlin.de/2537/},
     year = {2014}
    }

2013

  • E. Siragusa, D. Weese, K. Reinert, “Fast and accurate read mapping with approximate seeds and multiple backtracking”, vol. 41, iss. 7, 2013-01-28.
    cite this publication
    @article{fu_mi_publications1161,
 abstract = {We present Masai, a read mapper representing the state-of-the-art in terms of speed and accuracy. Our tool is an order of magnitude faster than RazerS 3 and mrFAST, 2-4 times faster and more accurate than Bowtie 2 and BWA. The novelties of our read mapper are filtration with approximate seeds and a method for multiple backtracking. Approximate seeds, compared with exact seeds, increase filtration specificity while preserving sensitivity. Multiple backtracking amortizes the cost of searching a large set of seeds by taking advantage of the repetitiveness of next-generation sequencing data. Combined together, these two methods significantly speed up approximate search on genomic data sets (a seed-partitioning sketch follows this entry). Masai is implemented in C++ using the SeqAn library. The source code is distributed under the BSD license and binaries for Linux, Mac OS X and Windows can be freely downloaded from http://www.seqan.de/projects/masai.},
     author = {E. Siragusa and D. Weese and K. Reinert},
 journal = {Nucleic Acids Research},
     month = {January},
     number = {7},
     pages = {e78},
     publisher = {Oxford University Press},
     title = {Fast and accurate read mapping with approximate seeds and multiple backtracking},
     url = {http://publications.imp.fu-berlin.de/1161/},
     volume = {41},
     year = {2013}
    }
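     The filtration guarantee that Masai, above, strengthens with approximate seeds follows from the pigeonhole principle: if a read occurs with at most e errors and is cut into s non-overlapping seeds, the errors can spoil at most e seeds, so searching every seed with up to q errors is lossless whenever e <= s(q + 1) - 1. Exact seeds are the case q = 0; approximate seeds (q >= 1) allow fewer, longer and therefore more specific seeds. A minimal C++ partitioning sketch, illustrative only and not Masai's code:

       // Cut a read into s non-overlapping seeds of (nearly) equal length.
       #include <iostream>
       #include <string>
       #include <vector>

       std::vector<std::string> partitionIntoSeeds(const std::string& read,
                                                   std::size_t s) {
           std::vector<std::string> seeds;
           const std::size_t len = read.size() / s;
           for (std::size_t i = 0; i < s; ++i) {
               std::size_t begin = i * len;
               std::size_t end = (i + 1 == s) ? read.size() : begin + len;
               seeds.push_back(read.substr(begin, end - begin));
           }
           return seeds;
       }

       int main() {
           const std::string read = "ACGTACGTACGTACGTACGTACGT";  // 24 bp toy read

           // e = 2 errors: three exact seeds (q = 0) give a lossless filter ...
           for (auto const& seed : partitionIntoSeeds(read, 3))
               std::cout << "exact seed: " << seed << "\n";

           // ... as do two seeds searched with q = 1 error each, which are
           // longer, filter more specifically, and even cover e = 3.
           for (auto const& seed : partitionIntoSeeds(read, 2))
               std::cout << "approximate seed (<= 1 error): " << seed << "\n";
       }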
  • E. Siragusa, D. Weese, K. Reinert, “Scalable string similarity search/join with approximate seeds and multiple backtracking”, 2013.
    cite this publication
    @inproceedings{fu_mi_publications1225,
 abstract = {We present in this paper scalable algorithms for optimal string similarity search and join. Our methods are variations of those applied in Masai [15], our recently published tool for mapping high-throughput DNA sequencing data with unprecedented speed and accuracy. The key features of our approach are filtration with approximate seeds and methods for multiple backtracking. Approximate seeds, compared to exact seeds, increase filtration specificity while preserving sensitivity. Multiple backtracking amortizes the cost of searching a large set of seeds. Combined together, these two methods significantly speed up string similarity search and join operations. Our tool is implemented in C++ and OpenMP using the SeqAn library. The source code is distributed under the BSD license and can be freely downloaded from http://www.seqan.de/projects/edbt2013.},
     address = {New York, NY, USA},
     author = {E. Siragusa and D. Weese and K. Reinert},
     booktitle = {Proceedings of the Joint EDBT/ICDT 2013 Workshops},
     keywords = {approximate seeds, backtracking, banded Myers bit-vector, banded dynamic programming, filtration, radix tree, suffix tree},
     pages = {370--374},
     publisher = {ACM},
     series = {EDBT '13},
     title = {Scalable string similarity search/join with approximate seeds and multiple backtracking},
     url = {http://publications.imp.fu-berlin.de/1225/},
     year = {2013}
    }
  • D. Weese, “Indices and Applications in High-Throughput Sequencing”, p. 196, 2013-06-05.
    cite this publication
    @phdthesis{fu_mi_publications1288,
 abstract = {Recent advances in sequencing technology allow the production of billions of base pairs per day in the form of reads of length 100 bp and longer, and current developments promise the personal \$1,000 genome within a couple of years. The analysis of these unprecedented amounts of data demands efficient data structures and algorithms. One such data structure is the substring index, which represents all substrings, or all substrings up to a certain length, contained in a given text.
    In this thesis we propose three substring indices, which we extend to be applicable to millions of sequences. We devise internal and external memory construction algorithms and a uniform framework for accessing the generalized suffix tree. Additionally, we propose different index-based applications, e.g. exact and approximate pattern matching and different repeat search algorithms (a minimal index illustration follows this entry).
    Second, we present the read mapping tool RazerS, which aligns millions of single or paired-end reads of arbitrary lengths to their potential genomic origin using either Hamming or edit distance. Our tool can work either losslessly or with a user-defined loss rate at higher speeds. Given the loss rate, we present a novel approach that guarantees not to lose more reads than specified. This enables the user to adapt to the problem at hand and provides a seamless tradeoff between sensitivity and running time. We compare RazerS with other state-of-the-art read mappers and show that it has the highest sensitivity and a comparable performance on various real-world datasets.
    Finally, we propose a general approach for frequency-based string mining, which has many applications, e.g. in contrast data mining. Our contribution is a novel and lightweight algorithm that is faster and uses less memory than the best available algorithms. We show its applicability for mining multiple databases with a variety of frequency constraints. As such, we use the notion of entropy from information theory to generalize the emerging substring mining problem to multiple databases. To demonstrate the improvement of our algorithm we compare it to recent approaches on real-world experiments of various string domains, e.g. natural language, DNA, or protein sequences.},
     author = {D. Weese},
     keywords = {HTS; full-text index; frequency string mining; read mapping; SeqAn},
     month = {June},
     school = {Freie Universit{\"a}t Berlin},
     title = {Indices and Applications in High-Throughput Sequencing},
     url = {http://publications.imp.fu-berlin.de/1288/},
     year = {2013}
    }
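    The generalized substring indices developed in this thesis are exposed through SeqAn's uniform index access framework. As a minimal illustration of that interface (a sketch following SeqAn's own tutorials, with header names and type tags as in SeqAn 2.x; this is not code from the thesis itself), an enhanced suffix array can be built over a text and queried with a Finder:

      #include <iostream>
      #include <seqan/index.h>

      using namespace seqan;

      int main()
      {
          // Build an enhanced-suffix-array (ESA) index, one of the
          // substring indices discussed in the thesis.
          typedef Index<CharString, IndexEsa<> > TIndex;
          TIndex index("tobeornottobe");

          // A Finder enumerates all occurrences of a pattern in the index.
          Finder<TIndex> finder(index);
          while (find(finder, "be"))
              std::cout << position(finder) << std::endl;  // positions 11 and 2
          return 0;
      }

    Swapping the index tag for another index type leaves the query code essentially unchanged, which is the point of the uniform access framework mentioned in the abstract.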
  • T. Steijger, J. F. Abril, P. G. Engström, F. Kokocinski, The RGASP Consortium, H. Richard, M. H. Schulz, D. Weese, T. Hubbard, R. Guigó, J. Harrow, P. Bertone, “Assessment of transcript reconstruction methods for RNA-seq”, vol. 10, iss. 12, 2013-11-03.
    cite this publication
    @article{fu_mi_publications1380,
     abstract = {We evaluated 25 protocol variants of 14 independent computational methods for exon identification, transcript reconstruction and expression-level quantification from RNA-seq data. Our results show that most algorithms are able to identify discrete transcript components with high success rates but that assembly of complete isoform structures poses a major challenge even when all constituent elements are identified. Expression-level estimates also varied widely across methods, even when based on similar transcript models. Consequently, the complexity of higher eukaryotic genomes imposes severe limitations on transcript recall and splice product discrimination that are likely to remain limiting factors for the analysis of current-generation RNA-seq data.},
     author = {T. Steijger and J. F. Abril and P. G. Engstr{\"o}m and F. Kokocinski and The RGASP Consortium and H. Richard and M. H. Schulz and D. Weese and T. Hubbard and R. Guig{\'o} and J. Harrow and P. Bertone},
     journal = {Nature Methods},
     month = {November},
     number = {12},
     pages = {1177--1184},
     publisher = {Nature Publishing Group},
     title = {Assessment of transcript reconstruction methods for RNA-seq},
     url = {http://publications.imp.fu-berlin.de/1380/},
     volume = {10},
     year = {2013}
    }
  • V. Neu, C. Bielow, I. Gostomski, R. Wintringer, R. Braun, K. Reinert, P. Schneider, H. Stuppner, C. Huber, “Rapid and Comprehensive Impurity Profiling of Synthetic Thyroxine by Ultrahigh-Performance Liquid Chromatography–High-Resolution Mass Spectrometry”, vol. 85, iss. 6, 2013-02-08.
    cite this publication
    @article{fu_mi_publications1397,
     abstract = {Rapid and efficient quality control according to the public authority regulations is mandatory to guarantee safety of the pharmaceuticals and to save resources in the pharmaceutical industry. In the case of so-called "grandfather products" like the synthetic thyroid hormone thyroxine, strict regulations enforce a detailed chemical analysis in order to characterize potentially toxic or pharmacologically relevant impurities. We report a straightforward workflow for the comprehensive impurity profiling of synthetic thyroid hormones and impurities employing ultrahigh-performance liquid chromatography (UHPLC) hyphenated to high-resolution mass spectrometry (HRMS). Five different batches of synthetic thyroxin were analyzed resulting in the detection of 71 impurities within 3 min total analysis time. Structural elucidation of the compounds was accomplished via a combination of accurate mass measurements, computer based calculations of molecular formulas, multistage high-resolution mass spectrometry (HRMS(n)), and nuclear magnetic resonance spectroscopy, which enabled the identification of 71 impurities, of which 47 have been unknown so far. Thirty of the latter were structurally elucidated, including products of deiodination, aliphatic chain oxidation, as well as dimeric compounds as new class of thyroid hormone derivatives. Limits of detection for the thyroid compounds were in the 6 ng/mL range for negative electrospray ionization mass spectrometric detection in full scan mode. Within day and day-to-day repeatabilities of retention times and peak areas were below 0.5\% and 3.5\% R.SD. The performance characteristics of the method in terms of robustness and information content clearly show that UHPLC-HRMS is adequate for the rapid and reliable detection, identification, and semiquantitative determination of trace levels of impurities in synthetic pharmaceuticals.},
     author = {V. Neu and C. Bielow and I. Gostomski and R. Wintringer and R. Braun and K. Reinert and P. Schneider and H. Stuppner and C. Huber},
     journal = {Analytical Chemistry},
     month = {February},
     number = {6},
     pages = {3309--3317},
     publisher = {American Chemical Society},
     title = {Rapid and Comprehensive Impurity Profiling of Synthetic Thyroxine by Ultrahigh-Performance Liquid Chromatography--High-Resolution Mass Spectrometry},
     url = {http://publications.imp.fu-berlin.de/1397/},
     volume = {85},
     year = {2013}
    }
  • V. Neu, C. Bielow, P. Schneider, K. Reinert, H. Stuppner, C. Huber, “Investigation of Reaction Mechanisms of Drug Degradation in the Solid State: A Kinetic Study Implementing Ultrahigh-Performance Liquid Chromatography and High-Resolution Mass Spectrometry for Thermally Stressed Thyroxine”, vol. 85, iss. 4, 2013-01-12.
    cite this publication
    @article{fu_mi_publications1398,
     abstract = {A reaction scheme was derived for the thermal degradation of thyroxine in the solid state, using data obtained from ultrahigh-performance liquid chromatography and high-resolution mass spectrometry (UHPLC-HRMS). To study the reaction mechanism and kinetics of the thermal degradation of the pharmaceutical in the solid state, a workflow was developed by generating compound-specific, time-dependent degradation or formation curves of at least 13 different degradation products. Such curves allowed one to distinguish between first- and second-generation degradation products, as well as impurities resulting from chemical synthesis. The structures of the degradation products were derived from accurate molecular masses and multistage mass spectrometry. Deiodination and oxidative side chain degradation were found to be the major degradation reactions, resulting in the formation of deiodinated thyroxines, as well as acetic acid, benzoic acid, formaldehyde, acetamide, hydroxyacetic acid, oxoacetic acid, hydroxyacetamide, or oxoacetamide derivatives of thyroxine or deiodinated thyroxine. Upon additional structural verification of mass spectrometric data using nuclear magnetic resonance spectroscopy, this comprehensive body of data sheds light on an elaborate, radical-driven reaction scheme, explaining the presence or formation of impurities in thermally stressed thyroxine.},
     author = {V. Neu and C. Bielow and P. Schneider and K. Reinert and H. Stuppner and C. Huber},
     journal = {Analytical Chemistry},
     month = {January},
     number = {4},
     pages = {2385--2390},
     publisher = {American Chemical Society},
     title = {Investigation of Reaction Mechanisms of Drug Degradation in the Solid State: A Kinetic Study Implementing Ultrahigh-Performance Liquid Chromatography and High-Resolution Mass Spectrometry for Thermally Stressed Thyroxine},
     url = {http://publications.imp.fu-berlin.de/1398/},
     volume = {85},
     year = {2013}
    }
  • A. Zerck, E. Nordhoff, H. Lehrach, K. Reinert, “Optimal precursor ion selection for LC-MALDI MS/MS”, vol. 14, iss. 1, 2013-02-18.
    cite this publication
    @article{fu_mi_publications1400,
     abstract = {Background
    Liquid chromatography mass spectrometry (LC-MS) maps in shotgun proteomics are often too complex to select every detected peptide signal for fragmentation by tandem mass spectrometry (MS/MS). Standard methods for precursor ion selection, commonly based on data dependent acquisition, select highly abundant peptide signals in each spectrum. However, these approaches produce redundant information and are biased towards high-abundance proteins.
    
    Results
    We present two algorithms for inclusion list creation that formulate precursor ion selection as an optimization problem. Given an LC-MS map, the first approach maximizes the number of selected precursors given constraints such as a limited number of acquisitions per RT fraction. Second, we introduce a protein sequence-based inclusion list that can be used to monitor proteins of interest. Given only the protein sequences, we create an inclusion list that optimally covers the whole protein set. Additionally, we propose an iterative precursor ion selection that aims at reducing the redundancy obtained with data dependent LC-MS/MS. We overcome the risk of erroneous assignments by including methods for retention time and proteotypicity predictions. We show that our method identifies a set of proteins requiring fewer precursors than standard approaches. Thus, it is well suited for precursor ion selection in experiments with limited sample amount or analysis time.
    
    Conclusions
    We present three approaches to precursor ion selection with LC-MALDI MS/MS. Using a well-defined protein standard and a complex human cell lysate, we demonstrate that our methods outperform standard approaches. Our algorithms are implemented as part of OpenMS and are available under http://www.openms.de.},
     author = {A. Zerck and E. Nordhoff and H. Lehrach and K. Reinert},
     journal = {BMC Bioinformatics},
     month = {February},
     number = {1},
     pages = {56},
     publisher = {BioMed Central},
     title = {Optimal precursor ion selection for LC-MALDI MS/MS},
     url = {http://publications.imp.fu-berlin.de/1400/},
     volume = {14},
     year = {2013}
    }
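    One way to picture the first of these formulations (a sketch of the stated objective and constraint only; the symbols x_p, P_f and c_f are ours, and the paper's actual model includes further constraints): with a binary variable x_p per detected precursor, P_f the set of precursors eligible in retention-time fraction f, and c_f the acquisition capacity of that fraction,

      \max \sum_{p} x_p
      \quad \text{s.t.} \quad \sum_{p \in P_f} x_p \le c_f \;\; \text{for all RT fractions } f,
      \qquad x_p \in \{0, 1\}.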
  • L. de la Garza, J. Krüger, Ch. Schärfe, M. Röttig, S. Aiche, K. Reinert, O. Kohlbacher, “From the Desktop to the Grid: conversion of KNIME Workflows to gUSE”, 2013.
    cite this publication
    @inproceedings{fu_mi_publications1441,
     abstract = {The Konstanz Information Miner is a user-friendly graphical workflow designer with a broad user base in industry and academia. Its broad range of embedded tools and its powerful data mining and visualization tools render it ideal for scientific workflows. It is thus used more and more in a broad range of applications. However, the free version typically runs on a desktop computer, restricting users if they want to tap into computing power. The grid and cloud User Support Environment is a free and open source project created for parallelized and distributed systems, but the creation of workflows with the included components has a steeper learning curve.
    In this work we suggest an easy-to-implement solution combining the ease-of-use of the Konstanz Information Miner with the computational power of distributed computing infrastructures. We present a solution permitting the conversion of workflows between the two platforms. This enables convenient development, debugging, and maintenance of scientific workflows on the desktop. These workflows can then be deployed on a cloud or grid, thus permitting large-scale computation.
    To achieve our goals, we relied on a Common Tool Description XML file format which describes the execution of arbitrary programs in a structured and easily readable and parseable way. In order to integrate external programs into KNIME we employed the Generic KNIME Nodes extension.},
     author = {L. de la Garza and J. Kr{\"u}ger and Ch. Sch{\"a}rfe and M. R{\"o}ttig and S. Aiche and K. Reinert and O. Kohlbacher},
     booktitle = {IWSG (International Workshop on Science Gateways)},
     title = {From the Desktop to the Grid: conversion of KNIME Workflows to gUSE},
     url = {http://publications.imp.fu-berlin.de/1441/},
     year = {2013}
    }
  • S. Aiche, “Inferring Proteolytic Processes from Mass Spectrometry Time Series Data”, 2013-09-30.
    cite this publication
    @phdthesis{fu_mi_publications1445,
     address = {Berlin, Germany},
     author = {S. Aiche},
     month = {September},
     school = {Freie Universit{\"a}t Berlin},
     title = {Inferring Proteolytic Processes from Mass Spectrometry Time Series Data},
     url = {http://publications.imp.fu-berlin.de/1445/},
     year = {2013}
    }
  • S. Andreotti, K. Reinert, S. Canzar, “The duplication-loss small phylogeny problem: from cherries to trees.”, vol. 20, iss. 9, 2013-09.
    cite this publication
    @article{fu_mi_publications1454,
     abstract = {The reconstruction of the history of evolutionary genome-wide events among a set of related organisms is of great biological interest since it can help to reveal the genomic basis of phenotypes. The sequencing of whole genomes facilitates the study of gene families that vary in size through duplication and loss events, like transfer RNA. However, a high sequence similarity often does not allow one to distinguish between orthologs and paralogs. Previous methods have addressed this difficulty by taking into account flanking regions of members of a family independently. We go one step further by inferring the order of genes of (a set of) families for ancestral genomes by considering the order of these genes on sequenced genomes. We present a novel branch-and-cut algorithm to solve the two species small phylogeny problem in the evolutionary model of duplications and losses. On average, our implementation, DupLoCut, improves the running time of a recently proposed method in the experiments on six Vibrionaceae lineages by a factor of about 200. Besides the mere improvement in running time, the efficiency of our approach allows us to extend our model from cherries of a species tree, that is, subtrees with two leaves, to the median of three species setting. Being able to determine the median of three species is of key importance to one of the most common approaches to ancestral reconstruction, and our experiments show that its repeated computation considerably reduces the number of duplications and losses along the tree both on simulated instances comprising 128 leaves and a set of Bacillus genomes. Furthermore, in our simulations we show that a reduction in cost goes hand in hand with an improvement of the predicted ancestral genomes. Finally, we prove that the small phylogeny problem in the duplication-loss model is NP-complete already for two species.},
     author = {S. Andreotti and K. Reinert and S. Canzar},
     journal = {Journal of Computational Biology},
     month = {September},
     number = {9},
     pages = {643--59},
     title = {The duplication-loss small phylogeny problem: from cherries to trees.},
     url = {http://publications.imp.fu-berlin.de/1454/},
     volume = {20},
     year = {2013}
    }
  • S. Nahnsen, C. Bielow, K. Reinert, O. Kohlbacher, “Tools for Label-free Peptide Quantification”, vol. 12, iss. 3, 2013-03-01.
    cite this publication
    @article{fu_mi_publications1473,
     abstract = {The increasing scale and complexity of quantitative proteomics studies complicate subsequent analysis of the acquired data. Untargeted label-free quantification, based either on feature intensities or on spectral counting, is a method that scales particularly well with respect to the number of samples. It is thus an excellent alternative to labeling techniques. In order to profit from this scalability, however, data analysis has to cope with large amounts of data, process them automatically, and do a thorough statistical analysis in order to achieve reliable results. We review the state of the art with respect to computational tools for label-free quantification in untargeted proteomics. The two fundamental approaches are feature-based quantification, relying on the summed-up mass spectrometric intensity of peptides, and spectral counting, which relies on the number of MS/MS spectra acquired for a certain protein. We review the current algorithmic approaches underlying some widely used software packages and briefly discuss the statistical strategies for analyzing the data.},
     author = {S. Nahnsen and C. Bielow and K. Reinert and O. Kohlbacher},
     journal = {Molecular \& Cellular Proteomics},
     month = {March},
     number = {3},
     pages = {549--556},
     title = {Tools for Label-free Peptide Quantification},
     url = {http://publications.imp.fu-berlin.de/1473/},
     volume = {12},
     year = {2013}
    }
  • Anne-Katrin Emde, “Next-generation sequencing algorithms - from read mapping to variant detection”, 2013-12-03.
    cite this publication
    @phdthesis{fu_mi_publications2526,
     abstract = {Next-Generation-Sequencing (NGS) has brought on a revolution in sequence analysis with its broad spectrum of applications ranging from genome resequencing to transcriptomics or metagenomics, and from fundamental research to diagnostics. The tremendous amounts of data necessitate highly efficient computational analysis tools for the wide variety of NGS applications. This thesis addresses a broad range of key computational aspects of resequencing applications, where a reference genome sequence is known and heavily used for interpretation of the newly sequenced sample. It presents tools for read mapping and benchmarking, for partial read mapping of small RNA reads and for structural variant/indel detection, and finally tools for detecting and genotyping SNVs and short indels. Our tools efficiently scale to large NGS data sets and are well-suited for advances in sequencing technology, since their generic algorithm design allows handling of arbitrary read lengths and variable error rates. Furthermore, they are implemented within the robust C++ library SeqAn, making them open-source, easily available, and potentially adaptable for the bioinformatics community. Among other applications, our tools have been integrated into a large-scale analysis pipeline and have been applied to large datasets, leading to interesting discoveries of human retrocopy variants and insights into the genetic causes of X-linked intellectual disabilities.},
     author = {Anne-Katrin Emde},
     month = {December},
     school = {Freie Universit{\"a}t Berlin},
     title = {Next-generation sequencing algorithms - from read mapping to variant detection},
     url = {http://publications.imp.fu-berlin.de/2526/},
     year = {2013}
    }

2012

  • S. Andreotti, G. W. Klau, K. Reinert, “Antilope - A Lagrangian Relaxation Approach to the de novo Peptide Sequencing Problem”, vol. 9, iss. 2, 2012-03.
    cite this publication
    @article{fu_mi_publications1047,
     abstract = {Peptide sequencing from mass spectrometry data is a key step in proteome research. Especially de novo sequencing, the identification of a peptide from its spectrum alone, is still a challenge even for state-of-the-art algorithmic approaches. In this paper we present ANTILOPE, a new fast and flexible approach based on mathematical programming. It builds on the spectrum graph model and works with a variety of scoring schemes. ANTILOPE combines Lagrangian relaxation for solving an integer linear programming formulation with an adaptation of Yen's k shortest paths algorithm. It shows a significant improvement in running time compared to mixed integer optimization and is as fast as other state-of-the-art tools. We also implemented a generic probabilistic scoring scheme that can be trained automatically for a dataset of annotated spectra and is independent of the mass spectrometer type. Evaluations on benchmark data show that ANTILOPE is competitive to the popular state-of-the-art programs PepNovo and NovoHMM both in terms of run time and accuracy. Furthermore, it offers increased flexibility in the number of considered ion types. ANTILOPE will be freely available as part of the open source proteomics library OpenMS.},
     address = {Los Alamitos, CA, USA},
     author = {S. Andreotti and G. W. Klau and K. Reinert},
     journal = {IEEE/ACM Transactions on Computational Biology and Bioinformatics (TCBB) },
     month = {March},
     note = {doi: 10.1109/TCBB.2011.59},
     number = {2},
     pages = {385--394},
     publisher = {IEEE Computer Society Press },
     title = {Antilope - A Lagrangian Relaxation Approach to the de novo Peptide Sequencing Problem},
     url = {http://publications.imp.fu-berlin.de/1047/},
     volume = {9},
     year = {2012}
    }
  • S. Aiche, K. Reinert, Ch. Schütte, D. Hildebrand, H. Schlüter, T. O. F. Conrad, “Inferring Proteolytic Processes from Mass Spectrometry Time Series Data Using Degradation Graphs”, vol. 7, iss. 7, 2012-07-17.
    cite this publication
    @article{fu_mi_publications1143,
     abstract = {Background: Proteases play an essential part in a variety of biological processes. Besides their importance under healthy conditions they are also known to have a crucial role in complex diseases like cancer. It was shown in the last years that not only the fragments produced by proteases but also their dynamics, especially ex vivo, can serve as biomarkers. But so far, only a few approaches were taken to explicitly model the dynamics of proteolysis in the context of mass spectrometry.

    Results: We introduce a new concept to model proteolytic processes, the degradation graph. The degradation graph is an extension of the cleavage graph, a data structure to reconstruct and visualize the proteolytic process. In contrast to previous approaches we extended the model to incorporate endoproteolytic processes and present a method to construct a degradation graph from mass spectrometry time-series data. Based on a degradation graph and the intensities extracted from the mass spectra it is possible to estimate reaction rates of the underlying processes. We further suggest a score to rate different degradation graphs in their ability to explain the observed data. This score is used in an iterative heuristic to improve the structure of the initially constructed degradation graph.

    Conclusion: We show that the proposed method is able to recover all degraded and generated peptides, the underlying reactions, and the reaction rates of proteolytic processes based on mass spectrometry time-series data. We use simulated and real data to demonstrate that a given process can be reconstructed even in the presence of extensive noise, isobaric signals and false identifications. While the model is currently only validated on peptide data it is also applicable to proteins, as long as the necessary time series data can be produced.},
     author = {S. Aiche and K. Reinert and Ch. Sch{\"u}tte and D. Hildebrand and H. Schl{\"u}ter and T. O. F. Conrad},
     journal = {PLoS ONE},
     month = {July},
     number = {7},
     pages = {e40656},
     publisher = {Public Library of Science},
     title = {Inferring Proteolytic Processes from Mass Spectrometry Time Series Data Using Degradation Graphs},
     url = {http://publications.imp.fu-berlin.de/1143/},
     volume = {7},
     year = {2012}
    }
  • D. Weese, M. Holtgrewe, K. Reinert, “RazerS 3: Faster, fully sensitive read mapping”, vol. 28, iss. 20, 2012-08-24.
    cite this publication
    @article{fu_mi_publications1159,
     abstract = {Motivation: During the last years, NGS sequencing has become a key technology for many applications in the biomedical sciences. Throughput continues to increase and new protocols provide longer reads than were previously available. In almost all applications, read mapping is a first step. Hence, it is crucial to have algorithms and implementations that perform fast, with high sensitivity, and are able to deal with long reads and a large absolute number of indels.

    Results: RazerS is a read mapping program with adjustable sensitivity based on counting q-grams. In this work we propose the successor RazerS 3, which now supports shared-memory parallelism, an additional seed-based filter with adjustable sensitivity, a much faster, banded version of the Myers' bit-vector algorithm for verification, memory saving measures and support for the SAM output format. This leads to a much improved performance for mapping reads, in particular long reads with many errors. We extensively compare RazerS 3 with other popular read mappers and show that its results are often superior to them in terms of sensitivity while exhibiting practical and often competitive run times. In addition, RazerS 3 works without a precomputed index.

    Availability and Implementation: Source code and binaries are freely available for download at http://www.seqan.de/projects/razers. RazerS 3 is implemented in C++ and OpenMP under a GPL license using the SeqAn library and supports Linux, Mac OS X, and Windows.},
     author = {D. Weese and M. Holtgrewe and K. Reinert},
     journal = {Bioinformatics},
     month = {August},
     number = {20},
     pages = {2592--2599},
     publisher = {Oxford University Press},
     title = {RazerS 3: Faster, fully sensitive read mapping},
     url = {http://publications.imp.fu-berlin.de/1159/},
     volume = {28},
     year = {2012}
    }
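    The q-gram counting filter mentioned in the abstract rests on the q-gram lemma: an occurrence of a length-n read with at most e errors shares at least t = n + 1 - (e + 1) * q q-grams with the matching reference window, so windows falling below the threshold t can be discarded before expensive verification. A toy version of this count (illustrative only; RazerS 3 uses a far more engineered counting scheme plus banded Myers bit-vector verification):

      #include <iostream>
      #include <string>
      #include <unordered_map>

      // Count q-grams of the read that also occur in the window,
      // matching each window q-gram occurrence at most once.
      int sharedQGrams(const std::string& read, const std::string& window, int q)
      {
          std::unordered_map<std::string, int> grams;
          for (std::size_t i = 0; i + q <= window.size(); ++i)
              ++grams[window.substr(i, q)];
          int shared = 0;
          for (std::size_t i = 0; i + q <= read.size(); ++i)
          {
              auto it = grams.find(read.substr(i, q));
              if (it != grams.end() && it->second > 0)
              {
                  --it->second;
                  ++shared;
              }
          }
          return shared;
      }

      int main()
      {
          const std::string read = "ACGTACGT", window = "ACGAACGTTT";
          const int q = 3, e = 1;
          // q-gram lemma threshold: t = n + 1 - (e + 1) * q.
          const int t = static_cast<int>(read.size()) + 1 - (e + 1) * q;
          if (sharedQGrams(read, window, q) >= t)
              std::cout << "candidate window, run verification\n";
          else
              std::cout << "window filtered out\n";
          return 0;
      }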
  • A.-K. Emde, M. H. Schulz, D. Weese, R. Sun, M. Vingron, V. M. Kalscheuer, S. A. Haas, K. Reinert, “Detecting genomic indel variants with exact breakpoints in single- and paired-end sequencing data using SplazerS”, vol. 28, iss. 5, 2012-01-11.
    cite this publication
    @article{fu_mi_publications1160,
     abstract = {Motivation: The reliable detection of genomic variation in resequencing data is still a major challenge, especially for variants larger than a few base pairs. Sequencing reads crossing boundaries of structural variation carry the potential for their identification, but are difficult to map.

    Results: Here we present a method for 'split' read mapping, where prefix and suffix match of a read may be interrupted by a longer gap in the read-to-reference alignment. We use this method to accurately detect medium-sized insertions and long deletions with precise breakpoints in genomic resequencing data. Compared with alternative split mapping methods, SplazerS significantly improves sensitivity for detecting large indel events, especially in variant-rich regions. Our method is robust in the presence of sequencing errors as well as alignment errors due to genomic mutations/divergence, and can be used on reads of variable lengths. Our analysis shows that SplazerS is a versatile tool applicable to unanchored or single-end as well as anchored paired-end reads. In addition, application of SplazerS to targeted resequencing data led to the interesting discovery of a complete, possibly functional gene retrocopy variant.

    Availability: SplazerS is available from http://www.seqan.de/projects/splazers.},
     author = {A.-K. Emde and M. H. Schulz and D. Weese and R. Sun and M. Vingron and V. M. Kalscheuer and S. A. Haas and K. Reinert},
     journal = {Bioinformatics},
     month = {January},
     number = {5},
     pages = {619--627},
     publisher = {Oxford University Press},
     title = {Detecting genomic indel variants with exact breakpoints in single- and paired-end sequencing data using SplazerS},
     url = {http://publications.imp.fu-berlin.de/1160/},
     volume = {28},
     year = {2012}
    }
  • B. Kehr, K. Reinert, A. E. Darling, “Hidden Breakpoints in Genome Alignments”, vol. 7534, 2012.
    cite this publication
    @incollection{fu_mi_publications1168,
     abstract = {During the course of evolution, an organism's genome can undergo changes that affect the large-scale structure of the genome. These changes include gene gain, loss, duplication, chromosome fusion, fission, and rearrangement. When gene gain and loss occurs in addition to other types of rearrangement, breakpoints of rearrangement can exist that are only detectable by comparison of three or more genomes. An arbitrarily large number of these 'hidden' breakpoints can exist among genomes that exhibit no rearrangements in pairwise comparisons.

    We present an extension of the multichromosomal breakpoint median problem to genomes that have undergone gene gain and loss. We then demonstrate that the median distance among three genomes can be used to calculate a lower bound on the number of hidden breakpoints present. We provide an implementation of this calculation including the median distance, along with some practical improvements on the time complexity of the underlying algorithm.

    We apply our approach to measure the abundance of hidden breakpoints in simulated data sets under a wide range of evolutionary scenarios. We demonstrate that in simulations the hidden breakpoint counts depend strongly on relative rates of inversion and gene gain/loss. Finally we apply current multiple genome aligners to the simulated genomes, and show that all aligners introduce a high degree of error in hidden breakpoint counts, and that this error grows with evolutionary distance in the simulation. Our results suggest that hidden breakpoint error may be pervasive in genome alignments.},
     address = {Berlin},
     author = {B. Kehr and K. Reinert and A. E. Darling},
     booktitle = {Lecture Notes in Computer Science: Algorithms in Bioinformatics},
     editor = {B. Raphael and J. Tang},
     pages = {391--403},
     publisher = {Springer-Verlag},
     title = {Hidden Breakpoints in Genome Alignments },
     url = {http://publications.imp.fu-berlin.de/1168/},
     volume = {7534},
     year = {2012}
    }
  • J. Junker, C. Bielow, A. Bertsch, M. Sturm, K. Reinert, O. Kohlbacher, “TOPPAS: a graphical workflow editor for the analysis of high-throughput proteomics data”, vol. 11, iss. 7, 2012-07-06.
    cite this publication
    @article{fu_mi_publications1396,
     abstract = {Mass spectrometry coupled to high-performance liquid chromatography (HPLC-MS) is evolving more quickly than ever. A wide range of different instrument types and experimental setups are commonly used. Modern instruments acquire huge amounts of data, thus requiring tools for an efficient and automated data analysis. Most existing software for analyzing HPLC-MS data is monolithic and tailored toward a specific application. A more flexible alternative consists of pipeline-based tool kits allowing the construction of custom analysis workflows from small building blocks, e.g., the Trans Proteomics Pipeline (TPP) or The OpenMS Proteomics Pipeline (TOPP). One drawback, however, is the hurdle of setting up complex workflows using command line tools. We present TOPPAS, The OpenMS Proteomics Pipeline ASsistant, a graphical user interface (GUI) for rapid composition of HPLC-MS analysis workflows. Workflow construction reduces to simple drag-and-drop of analysis tools and adding connections in between. Integration of external tools into these workflows is possible as well. Once workflows have been developed, they can be deployed in other workflow management systems or batch processing systems in a fully automated fashion. The implementation is portable and has been tested under Windows, Mac OS X, and Linux. TOPPAS is open-source software and available free of charge at http://www.OpenMS.de/TOPPAS .},
     author = {J. Junker and C. Bielow and A. Bertsch and M. Sturm and K. Reinert and O. Kohlbacher},
     journal = {J. Proteome Res.},
     month = {July},
     number = {7},
     pages = {3914--3920},
     publisher = {ACS Publications},
     title = {TOPPAS: a graphical workflow editor for the analysis of high-throughput proteomics data},
     url = {http://publications.imp.fu-berlin.de/1396/},
     volume = {11},
     year = {2012}
    }
  • C. Bielow, “Quantification and Simulation of Liquid Chromatography-Mass Spectrometry Data”, 2012-10-29.
    cite this publication
    @phdthesis{fu_mi_publications1444,
     abstract = {Computational mass spectrometry is a fast evolving field that has attracted increased attention over the last couple of years. The performance of software solutions determines the success of analysis to a great extent. New algorithms are required to reflect new experimental procedures and deal with new instrument generations.

    One essential component of algorithm development is the validation (as well as comparison) of software on a broad range of data sets. This requires a gold standard (or so-called ground truth), which is usually obtained by manual annotation of a real data set. Comprehensive manually annotated public data sets for mass spectrometry data are labor-intensive to produce and their quality strongly depends on the skill of the human expert. Some parts of the data may even be impossible to annotate due to high levels of noise or other ambiguities. Furthermore, manually annotated data is usually not available for all steps in a typical computational analysis pipeline. We thus developed the most comprehensive simulation software to date, which allows the generation of multiple levels of ground truth and features a plethora of settings to reflect experimental conditions and instrument settings. The simulator is used to generate several distinct types of data. The data are subsequently employed to evaluate existing algorithms. Additionally, we employ simulation to determine the influence of instrument attributes and sample complexity on the ability of algorithms to recover information. The results give valuable hints on how to optimize experimental setups.

    Furthermore, this thesis introduces two quantitative approaches, namely a decharging algorithm based on integer linear programming and a new workflow for identification of differentially expressed proteins for a large in vitro study on toxic compounds. Decharging infers the uncharged mass of a peptide (or protein) by clustering all its charge variants. The latter occur frequently under certain experimental conditions. We employ simulation to show that decharging is robust against missing values even for high complexity data and that the algorithm outperforms other solutions in terms of mass accuracy and run time on real data.

    The last part of this thesis deals with a new state-of-the-art workflow for protein quantification based on isobaric tags for relative and absolute quantitation (iTRAQ). We devise a new approach to isotope correction, propose an experimental design, introduce new metrics of iTRAQ data quality, and confirm putative properties of iTRAQ data using a novel approach.

    All tools developed as part of this thesis are implemented in OpenMS, a C++ library for computational mass spectrometry.},
     address = {Berlin, Germany},
     author = {C. Bielow},
     month = {October},
     school = {Freie Universit{\"a}t Berlin},
     title = {Quantification and Simulation of Liquid Chromatography-Mass Spectrometry Data},
     url = {http://publications.imp.fu-berlin.de/1444/},
     year = {2012}
    }

2011

  • C. Bielow, S. Aiche, S. Andreotti, K. Reinert, “MSSimulator: Simulation of Mass Spectrometry Data”, vol. 10, iss. 7, 2011-07.
    cite this publication
    @article{fu_mi_publications1066,
     abstract = {Mass spectrometry coupled to liquid chromatography (LC-MS and LC-MS/MS) is commonly used to analyze the protein content of biological samples in large scale studies, enabling quantitation and identification of proteins and peptides using a wide range of experimental protocols, algorithms, and statistical models to analyze the data. Currently it is difficult to compare the plethora of algorithms for these tasks. So far, curated benchmark data exists for peptide identification algorithms, but data that represents a ground truth for the evaluation of LC-MS data is limited. Hence there have been attempts to simulate such data in a controlled fashion to evaluate and compare algorithms. We present MSSimulator, a simulation software for LC-MS and LC-MS/MS experiments. Starting from a list of proteins from a FASTA file, the simulation will perform in-silico digestion, retention time prediction, ionization filtering, and raw signal simulation (including MS/MS), while providing many options to change the properties of the resulting data like elution profile shape, resolution and sampling rate. Several protocols for SILAC, iTRAQ or MS(E) are available, in addition to the usual label-free approach, making MSSimulator the most comprehensive simulator for LC-MS and LC-MS/MS data.},
     author = {C. Bielow and S. Aiche and S. Andreotti and K. Reinert},
     journal = {Journal of Proteome Research},
     month = {July},
     number = {7},
     pages = {2922--2929},
     title = {MSSimulator: Simulation of Mass Spectrometry Data},
     url = {http://publications.imp.fu-berlin.de/1066/},
     volume = {10},
     year = {2011}
    }
  • C. Bielow, C. Gröpl, O. Kohlbacher, K. Reinert, “Bioinformatics for qualitative and quantitative proteomics.”, vol. 719, 2011-01.
    cite this publication
    @incollection{fu_mi_publications1067,
     abstract = {Mass spectrometry is today a key analytical technique to elucidate the amount and content of proteins expressed in a certain cellular context. The degree of automation in proteomics has yet to reach that of genomic techniques, but even current technologies make a manual inspection of the data infeasible. This article addresses the key algorithmic problems bioinformaticians face when handling modern proteomic samples and shows common solutions to them. We provide examples on how algorithms can be combined to build relatively complex analysis pipelines, point out certain pitfalls and aspects worth considering and give a list of current state-of-the-art tools.},
     author = {C. Bielow and C. Gr{\"o}pl and O. Kohlbacher and K. Reinert},
     booktitle = {Bioinformatics for Omics Data Methods and Protocols},
     journal = {Methods in molecular biology (Clifton, N.J.)},
     month = {January},
     pages = {331--49},
     publisher = {Humana Press},
     title = {Bioinformatics for qualitative and quantitative proteomics.},
     url = {http://publications.imp.fu-berlin.de/1067/},
     volume = {719},
     year = {2011}
    }
  • M. Holtgrewe, A.-K. Emde, D. Weese, K. Reinert, “A Novel And Well-Defined Benchmarking Method For Second Generation Read Mapping”, vol. 12, iss. 120, 2011-05-26.
    cite this publication
    @article{fu_mi_publications1072,
     abstract = {Background
    Second generation sequencing technologies yield DNA sequence data at ultra-high throughput. Common to most biological applications is a mapping of the reads to an almost identical or highly similar reference genome. The assessment of the quality of read mapping results is not straightforward and has not been formalized so far. Hence, it has not been easy to compare different read mapping approaches in a unified way and to determine which program is the best for what task.
    
    Results
    We present a new benchmark method, called Rabema (Read Alignment BEnchMArk), for read mappers. It consists of a strict definition of the read mapping problem and of tools to evaluate the result of arbitrary read mappers supporting the SAM output format.
    
    Conclusions
    We show the usefulness of the benchmark program by performing a comparison of popular read mappers. The tools supporting the benchmark are licensed under the GPL and available from http://www.seqan.de/projects/rabema.html.},
     author = {M. Holtgrewe and A.-K. Emde and D. Weese and K. Reinert},
     journal = {BMC Bioinformatics},
     month = {May},
     number = {120},
     publisher = {BioMed Central},
     title = {A Novel And Well-Defined Benchmarking Method For Second Generation Read Mapping},
     url = {http://publications.imp.fu-berlin.de/1072/},
     volume = {12},
     year = {2011}
    }
  • B. Kehr, D. Weese, K. Reinert, “STELLAR: fast and exact local alignments”, vol. 12, iss. S9, 2011-10-03.
    cite this publication
    @article{fu_mi_publications1092,
     abstract = {Background
    Large-scale comparison of genomic sequences requires reliable tools for the search of local alignments. Practical local aligners are in general fast, but heuristic, and hence sometimes miss significant matches.
    
    Results
    We present here the local pairwise aligner STELLAR that has full sensitivity for {\ensuremath{\epsilon}}-alignments, i.e. guarantees to report all local alignments of a given minimal length and maximal error rate. The aligner is composed of two steps, filtering and verification. We apply the SWIFT algorithm for lossless filtering, and have developed a new verification strategy that we prove to be exact. Our results on simulated and real genomic data confirm and quantify the conjecture that heuristic tools like BLAST or BLAT miss a large percentage of significant local alignments.
    
    Conclusions
    STELLAR is very practical and fast on very long sequences which makes it a suitable new tool for finding local alignments between genomic sequences under the edit distance model. Binaries are freely available for Linux, Windows, and Mac OS X at http://www.seqan.de/projects/stellar. The source code is freely distributed with the SeqAn C++ library version 1.3 and later at http://www.seqan.de.},
     author = {B. Kehr and D. Weese and K. Reinert},
     journal = {BMC Bioinformatics},
     month = {October},
     number = {S9},
     pages = {S15},
     publisher = {BioMed Central},
     title = {STELLAR: fast and exact local alignments},
     url = {http://publications.imp.fu-berlin.de/1092/},
     volume = {12},
     year = {2011}
    }
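    STELLAR couples lossless SWIFT filtering with an exact verification step; its end result is a set of local alignments under the edit distance model. For a feel for that kind of output, a single local alignment can be computed directly with SeqAn's Align module (a minimal sketch assuming SeqAn 2.x; this is plain quadratic-time Smith-Waterman, not STELLAR's filter-and-verify pipeline):

      #include <iostream>
      #include <seqan/align.h>

      using namespace seqan;

      int main()
      {
          // Two DNA sequences to be aligned locally.
          Align<DnaString> align;
          resize(rows(align), 2);
          assignSource(row(align, 0), "CGATTTCACGATTGC");
          assignSource(row(align, 1), "TTCACGGTT");

          // Smith-Waterman with match 2, mismatch -1, linear gap cost -2.
          int score = localAlignment(align, Score<int, Simple>(2, -1, -2));
          std::cout << "score: " << score << "\n" << align;
          return 0;
      }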
  • S. Böcker, B. Kehr, F. Rasche, “Determination of Glycan Structure from Tandem Mass Spectra”, vol. 8, iss. 4, 2011-07.
    cite this publication
    @article{fu_mi_publications1132,
     abstract = {Glycans are molecules made from simple sugars that form complex tree structures. Glycans constitute one of the most important protein modifications and identification of glycans remains a pressing problem in biology. Unfortunately, the structure of glycans is hard to predict from the genome sequence of an organism. In this paper, we consider the problem of deriving the topology of a glycan solely from tandem mass spectrometry (MS) data. We study how to generate glycan tree candidates that sufficiently match the sample mass spectrum, avoiding the combinatorial explosion of glycan structures. Unfortunately, the resulting problem is known to be computationally hard. We present an efficient exact algorithm for this problem based on fixed-parameter algorithmics that can process a spectrum in a matter of seconds. We also report some preliminary results of our method on experimental data, combining it with a preliminary candidate evaluation scheme. We show that our approach is fast in applications, and that we achieve very good de novo identification results. Finally, we show how to count the number of glycan topologies for a fixed size or a fixed mass. We generalize this result to count the number of (labeled) trees with bounded out degree, improving on results obtained using P{\'o}lya's enumeration theorem.},
     author = {S. B{\"o}cker and B. Kehr and F. Rasche},
     journal = {IEEE/ACM Transactions on Computational Biology and Bioinformatics },
     month = {July},
     number = {4},
     pages = {976--986},
     publisher = {IEEE computer society, TCBB},
     title = {Determination of Glycan Structure from Tandem Mass Spectra},
     url = {http://publications.imp.fu-berlin.de/1132/},
     volume = {8},
     year = {2011}
    }
  • T. Rausch, K. Reinert, “Practical multiple Sequence alignment”, p. 347, 2011.
    cite this publication
    @incollection{fu_mi_publications393,
     abstract = {Multiple sequence alignment as a means of comparing DNA, RNA or amino acid sequences is an essential precondition for various analyses, including structure prediction, modeling binding sites, phylogeny or function prediction. This range of applications implies a demand for versatile, flexible and specialized methods to compute accurate alignments. This chapter summarizes the key algorithmic insights gained in the past years to facilitate both an easy understanding of the current multiple sequence alignment literature and to enable the readers to use and apply current tools in their own everyday research.},
     author = {T. Rausch and K. Reinert},
     booktitle = {Problem Solving Handbook in Computational Biology and Bioinformatics},
     editor = {L. S. Heath and N. Ramakrishnan},
     pages = {21--43},
     publisher = {Springer Science+Business Media},
     title = {Practical multiple Sequence alignment},
     url = {http://publications.imp.fu-berlin.de/393/},
     year = {2011}
    }
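    The consistency-based progressive alignment strategy surveyed in this chapter is available in SeqAn as a single call (a minimal sketch assuming SeqAn 2.x and its graph_msa module; the four protein fragments are arbitrary short examples):

      #include <iostream>
      #include <seqan/align.h>
      #include <seqan/graph_msa.h>

      using namespace seqan;

      int main()
      {
          // Four short protein fragments to be aligned progressively.
          Align<String<AminoAcid> > align;
          resize(rows(align), 4);
          assignSource(row(align, 0), "DPKKPRGKMSSYAFFVQT");
          assignSource(row(align, 1), "RVKRPMNAFIVWSRDQRR");
          assignSource(row(align, 2), "FPKKPLTPYFRFFMEKRA");
          assignSource(row(align, 3), "HIKKPLNAFMLYMKEMRA");

          // Progressive multiple alignment with BLOSUM62 scores and
          // affine gap costs (gap extension -1, gap opening -11).
          globalMsaAlignment(align, Blosum62(-1, -11));
          std::cout << align;
          return 0;
      }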

2010

  • Tobias Rausch, “Dissecting multiple sequence alignment methods”, 2010-06-08.
    cite this publication
    @phdthesis{fu_mi_publications2522,
     abstract = {Multiple sequence alignments are an indispensable tool in bioinformatics. Many applications rely on accurate multiple alignments, including protein structure prediction, phylogeny and the modeling of binding sites. In this thesis we dissected and analyzed the crucial algorithms and data structures required to construct such a multiple alignment. Based upon that dissection, we present a novel graph-based multiple sequence alignment program and a new method for multi-read alignments occurring in assembly projects. The advantage of the graph-based alignment is that a single vertex can represent a single character, a large segment or even an abstract entity such as a gene. This gives rise to the opportunity to apply the consistency-based progressive alignment paradigm to alignments of genomic sequences. The proposed multi-read alignment method outperforms similar methods in terms of alignment quality and it is apparently one of the first methods that can readily be used for insert sequencing. An important aspect of this thesis was the design, the development and the integration of the essential multiple sequence alignment components in the SeqAn library. SeqAn is a software library for sequence analysis that provides the core algorithmic components required to analyze large-scale sequence data. SeqAn aims at bridging the current gap between algorithm theory and available practical implementations in bioinformatics. Hence, we always describe in conjunction to the theoretical development of the methods, the actual implementation of the data structures and algorithms in order to strengthen the use of SeqAn as an experimental platform for rapidly developing and testing applications. All presented methods are part of the open source SeqAn library that can be downloaded from our website, www.seqan.de.},
     author = {Tobias Rausch},
     month = {June},
     school = {Freie Universit{\"a}t Berlin},
     title = {Dissecting multiple sequence alignment methods},
     url = {http://publications.imp.fu-berlin.de/2522/},
     year = {2010}
    }
  • A.-K. Emde, M. Grunert, D. Weese, K. Reinert, S. R. Sperling, “MicroRazerS: Rapid alignment of small RNA reads”, vol. 26, iss. 1, 2010-01-01.
    cite this publication
    @article{fu_mi_publications792,
     abstract = {Motivation: Deep sequencing has become the method of choice for determining the small RNA content of a cell. Mapping the sequenced reads onto their reference genome serves as the basis for all further analyses, namely for identification and quantification. A method frequently used is Mega BLAST followed by several filtering steps, even though it is slow and inefficient for this task. Also, none of the currently available short read aligners has established itself for the particular task of small RNA mapping.  Results: We present MicroRazerS, a tool optimized for mapping small RNAs onto a reference genome. It is an order of magnitude faster than Mega BLAST and comparable in speed to other short read mapping tools. In addition, it is more sensitive and easy to handle and adjust.  Availability: MicroRazerS is part of the SeqAn C++ library and can be downloaded from http://www.seqan.de/projects/MicroRazerS.html.  Contact: emde@inf.fu-berlin.de, grunert@molgen.mpg.de},
     author = {A.-K. Emde and M. Grunert and D. Weese and K. Reinert and S. R. Sperling},
     journal = {Bioinformatics},
     month = {January},
     number = {1},
     pages = {123--124},
     publisher = {Oxford University Press},
     title = {MicroRazerS: Rapid alignment of small RNA reads},
     url = {http://publications.imp.fu-berlin.de/792/},
     volume = {26},
     year = {2010}
    }
  • A. Gogol-Döring, K. Reinert, “Biological Sequence Analysis using the SeqAn C++ Library”, iss. 1, p. 311, 2010.
    cite this publication
    @book{fu_mi_publications825,
     abstract = {Before the SeqAn project, there was clearly a lack of available implementations in sequence analysis, even for standard tasks. Implementations of needed algorithmic components were either unavailable or hard to access in third-party monolithic software products. Addressing these concerns, the developers of SeqAn created a comprehensive, easy-to-use, open source C++ library of efficient algorithms and data structures for the analysis of biological sequences. Written by the founders of this project, Biological Sequence Analysis Using the SeqAn C++ Library covers the SeqAn library, its documentation, and the supporting infrastructure.
    
    
    The first part of the book describes the general library design. It introduces biological sequence analysis problems, discusses the benefit of using software libraries, summarizes the design principles and goals of SeqAn, details the main programming techniques used in SeqAn, and demonstrates the application of these techniques in various examples. Focusing on the components provided by SeqAn, the second part explores basic functionality, sequence data structures, alignments, pattern and motif searching, string indices, and graphs. The last part illustrates applications of SeqAn to genome alignment, consensus sequence in assembly projects, suffix array construction, and more.
    
    
    This handy book describes a user-friendly library of efficient data types and algorithms for sequence analysis in computational biology. SeqAn enables not only the implementation of new algorithms, but also the sound analysis and comparison of existing algorithms.},
     address = {Boca Raton, USA},
     author = {A. Gogol-D{\"o}ring and K. Reinert},
     number = {1},
     publisher = {CRC Press},
     series = {Chapman \& Hall/CRC Mathematical \& Computational Biology },
     title = {Biological Sequence Analysis using the SeqAn C++ Library},
     url = {http://publications.imp.fu-berlin.de/825/},
     year = {2010}
    }
  • C. Bielow, S. Ruzek, C. Huber, K. Reinert, “Optimal Decharging and Clustering of Charge Ladders Generated in ESI−MS”, vol. 9, iss. 5, 2010-03-04.
    cite this publication
    @article{fu_mi_publications895,
     abstract = {In electrospray ionization mass spectrometry (ESI-MS), peptide and protein ions are usually observed in multiple charge states. Moreover, adduction of the multiply charged species with other ions frequently results in quite complex signal patterns for a single analyte, which significantly complicates the derivation of quantitative information from the mass spectra. Labeling strategies targeting the MS1 level further aggravate this situation, as multiple biological states such as healthy or diseased must be represented simultaneously. We developed an integer linear programming (ILP) approach, which can cluster signals belonging to the same peptide or protein. The algorithm is general in that it models all possible shifts of signals along the m/z axis. These shifts can be induced by different charge states of the compound, the presence of adducts (e.g., potassium or sodium), and/or a fixed mass label (e.g., from ICAT or nicotinic acid labeling), or any combination of the above. We show that our approach can be used to infer more features in labeled data sets, correct wrong charge assignments even in high-resolution MS, improve mass precision, and cluster charged species in different charge states and several adduct types.},
     author = {C. Bielow and S. Ruzek and C. Huber and K. Reinert},
     journal = {J. Proteome Res.},
     month = {March},
     note = {online publication complete, printed publication to be expected},
     number = {5},
     pages = {2688--2695},
     publisher = {ACS Publications},
     title = {Optimal Decharging and Clustering of Charge Ladders Generated in ESI-MS},
     url = {http://publications.imp.fu-berlin.de/895/},
     volume = {9},
     year = {2010}
    }
  • M. Birn, M. Holtgrewe, P. Sanders, J. Singler, “Simple and Fast Nearest Neighbor Search”, 2010.
    cite this publication
    @article{fu_mi_publications926,
     abstract = {We present a simple randomized data structure for two-dimensional point sets that allows fast nearest neighbor queries in many cases. An implementation outperforms several previous implementations for commonly used benchmarks.},
     author = {M. Birn and M. Holtgrewe and P. Sanders and J. Singler},
     journal = {2010 Proceedings of the Twelfth Workshop on Algorithm Engineering and Experiments (ALENEX)},
     pages = {43--54},
     publisher = {ACM SIAM},
     title = {Simple and Fast Nearest Neighbor Search},
     url = {http://publications.imp.fu-berlin.de/926/},
     year = {2010}
    }
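    A useful reference point for such benchmarks is the exhaustive O(n) query that any sublinear nearest neighbor structure must beat. A self-contained baseline sketch (not the paper's randomized data structure):

    #include <cstddef>
    #include <iostream>
    #include <limits>
    #include <vector>

    struct Point { double x, y; };

    // Exhaustive nearest neighbor query over a 2D point set: the linear-scan
    // baseline against which faster data structures are measured.
    std::size_t nearest(const std::vector<Point>& pts, Point q)
    {
        std::size_t best = 0;
        double bestDist = std::numeric_limits<double>::max();
        for (std::size_t i = 0; i < pts.size(); ++i)
        {
            double dx = pts[i].x - q.x, dy = pts[i].y - q.y;
            double d = dx * dx + dy * dy; // squared distance suffices for argmin
            if (d < bestDist) { bestDist = d; best = i; }
        }
        return best;
    }

    int main()
    {
        std::vector<Point> pts = {{0, 0}, {3, 4}, {1, 1}};
        std::cout << nearest(pts, {0.9, 1.2}) << '\n'; // prints 2
    }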
  • M. Holtgrewe, P. Sanders, C. Schulz, “Engineering a scalable high quality graph partitioner”, 2010-04.
    cite this publication
    @article{fu_mi_publications930,
     abstract = {We describe an approach to parallel graph partitioning that scales to hundreds of processors and produces a high solution quality. For example, for many instances from Walshaw's benchmark collection we improve the best known partitioning. We use the well known framework of multi-level graph partitioning. All components are implemented by scalable parallel algorithms. Quality improvements compared to previous systems are due to better prioritization of edges to be contracted, better approximation algorithms for identifying matchings, better local search heuristics, and perhaps most notably, a parallelization of the FM local search algorithm that works more locally than previous approaches.},
     author = {M. Holtgrewe and P. Sanders and C. Schulz},
     journal = {2010 IEEE International Symposium on Parallel \& Distributed Processing (IPDPS)},
     month = {April},
     pages = {1--12},
     title = {Engineering a scalable high quality graph partitioner},
     url = {http://publications.imp.fu-berlin.de/930/},
     year = {2010}
    }
  • D. Hüser, A. Gogol-Döring, T. Lutter, S. Weger, K. Winter, E.-M. Hammer, T. Cathomen, K. Reinert, R. Heilbronn, “Integration Preferences of Wildtype AAV-2 for Consensus Rep-Binding Sites at Numerous Loci in the Human Genome”, vol. 6, iss. 7, 2010-07-08.
    cite this publication
    @article{fu_mi_publications935,
     abstract = {Adeno-associated virus type 2 (AAV) is known to establish latency by preferential integration in human chromosome 19q13.42. The AAV non-structural protein Rep appears to target a site called AAVS1 by simultaneously binding to Rep-binding sites (RBS) present on the AAV genome and within AAVS1. In the absence of Rep, as is the case with AAV vectors, chromosomal integration is rare and random. For a genome-wide survey of wildtype AAV integration a linker-selection-mediated (LSM)-PCR strategy was designed to retrieve AAV-chromosomal junctions. DNA sequence determination revealed wildtype AAV integration sites scattered over the entire human genome. The bioinformatic analysis of these integration sites compared to those of rep-deficient AAV vectors revealed a highly significant overrepresentation of integration events near to consensus RBS. Integration hotspots included AAVS1 with 10\% of total events. Novel hotspots near consensus RBS were identified on chromosome 5p13.3 denoted AAVS2 and on chromosome 3p24.3 denoted AAVS3. AAVS2 displayed seven independent junctions clustered within only 14 bp of a consensus RBS which proved to bind Rep in vitro similar to the RBS in AAVS3. Expression of Rep in the presence of rep-deficient AAV vectors shifted targeting preferences from random integration back to the neighbourhood of consensus RBS at hotspots and numerous additional sites in the human genome. In summary, targeted AAV integration is not as specific for AAVS1 as previously assumed. Rather, Rep targets AAV to integrate into open chromatin regions within reach of various consensus RBS homologues in the human genome.},
     author = {D. H{\"u}ser and A. Gogol-D{\"o}ring and T. Lutter and S. Weger and K. Winter and E.-M. Hammer and T. Cathomen and K. Reinert and R. Heilbronn},
     journal = {PLoS Pathogens},
     month = {July},
     number = {7},
     pages = {e1000985},
     title = {Integration Preferences of Wildtype AAV-2 for Consensus Rep-Binding Sites at Numerous Loci in the Human Genome},
     url = {http://publications.imp.fu-berlin.de/935/},
     volume = {6},
     year = {2010}
    }
  • M. Holtgrewe, “Mason – A Read Simulator for Second Generation Sequencing Data”, 2010-10.
    cite this publication
    @article{fu_mi_publications962,
     abstract = {We present a read simulator for Illumina, 454 and Sanger reads. Its features include position-specific error rates and base quality values. For Illumina reads, we give a comprehensive analysis with empirical data for the error and quality model. For the other technologies, we use models from the literature. It has been written with performance in mind and can sample reads from large genomes. The C++ source code is extensible, and freely available under the GPL/LGPL.},
     author = {M. Holtgrewe},
     journal = {Technical Report FU Berlin},
     month = {October},
     title = {Mason -- A Read Simulator for Second Generation Sequencing Data},
     url = {http://publications.imp.fu-berlin.de/962/},
     year = {2010}
    }
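    The central idea named in the entry above is a position-specific error model. A toy C++ sketch of that idea, sampling a read and flipping bases with a per-position substitution rate (the rates and names here are hypothetical; Mason itself additionally models indels and base quality values):

    #include <iostream>
    #include <random>
    #include <string>
    #include <vector>

    // Sample a read of the given length from a random position of the
    // reference, then substitute each base independently with a
    // position-specific error rate.
    std::string sampleRead(const std::string& ref, std::size_t len,
                           const std::vector<double>& errRate, std::mt19937& rng)
    {
        std::uniform_int_distribution<std::size_t> posDist(0, ref.size() - len);
        std::uniform_real_distribution<double> coin(0.0, 1.0);
        std::uniform_int_distribution<int> baseDist(0, 3);

        std::string read = ref.substr(posDist(rng), len);
        for (std::size_t i = 0; i < len; ++i)
            if (coin(rng) < errRate[i])          // position-specific error
                read[i] = "ACGT"[baseDist(rng)]; // may re-draw the same base
        return read;
    }

    int main()
    {
        std::mt19937 rng(42);
        std::string ref = "ACGTACGTTTGACCAGTACGGGATCCA";
        std::vector<double> err(10);
        for (std::size_t i = 0; i < err.size(); ++i)
            err[i] = 0.001 + 0.004 * i;          // error rate rising along the read
        std::cout << sampleRead(ref, err.size(), err, rng) << '\n';
    }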

2009

  • Ole Schulz-Trieglaff, “Computational methods for Quantitative Peptide Mass Spectrometry”, 2009-08-05.
    cite this publication
    @phdthesis{fu_mi_publications2534,
     abstract = {This thesis presents algorithms for the analysis of liquid chromatography-mass spectrometry (LC-MS) data. Mass spectrometry is a technology that can be used to determine the identities and abundances of the compounds in complex samples. In combination with liquid chromatography, it has become a popular method in the field of proteomics, the large-scale study of proteins and peptides in living systems. This area of research has gained a lot of interest in recent years since proteins control fundamental reactions in the cell. Consequently, a deeper knowledge of their function is expected to be crucial for the development of new drugs and the cure of diseases. The data sets obtained from an LC-MS experiment are large and highly complex. The outcome of such an experiment is called an LC-MS map. The map is a collection of mass spectra. They contain, among the signals of interest, a high amount of noise and other disturbances. That is why algorithms for the low-level processing of LC-MS data are becoming increasingly important. These algorithms are the focus of this text. Our novel contributions are threefold: first, we introduce SweepWavelet, an algorithm for the efficient detection and quantification of peptides from LC-MS data. The quantification of proteins and peptides using mass spectrometry is of high interest for biomedical research but also for the pharmaceutical industry since it is usually among the first steps in an LC-MS data analysis pipeline and all subsequent steps depend on its quality. Our approach was among the first to address this problem in a sound computational framework. It consists of three steps: first, we apply a tailored wavelet function that filters mass spectra for the isotope peaks of peptides. Second, we use a method inspired by the sweep-line paradigm which makes use of the redundant information in LC-MS data to determine mass, charge, retention time and abundance of all peptides. Finally, we apply a flexible peptide signal model to filter the extracted signals for false positives. The second part of this thesis deals with the benchmarking of LC-MS signal detection algorithms. This is a non-trivial task since it is difficult to establish a ground truth using real world samples: which sample compounds become visible in an LC-MS data set is not known in advance. To this end, we use annotated data and simulations to assess the performance of currently available algorithms. To simulate benchmark data, we developed a simulation software called LC-MSsim. It incorporates computational models for retention time prediction, peptide detectability, isotope pattern and elution peaks. Using this software, we can simulate all steps in an LC-MS experiment and obtain a list with the positions, charges and abundances of all peptide signals contained in the resulting LC-MS map. This gives us a ground truth against which we can match the results of a signal detection algorithm. In this thesis, we use it for the benchmarking of quantification algorithms but its scope is wider and it can also be used to evaluate other algorithms. To our knowledge, LC-MSsim is the first software that can simulate the full LC-MS data acquisition process. The third contribution of this thesis is a statistical framework for the quality assessment of quantitative LC-MS experiments. Whereas quality assessment and control are already widespread in the field of gene expression analysis, our work is the first to address this problem for LC-MS data. We use methods from robust statistics to detect outlier LC-MS maps in large-scale quantitative experiments. Our approach introduces the notion of quality descriptors to derive an abstract representation of an LC-MS map and applies a robust principal component analysis based on projection pursuit. We show that it is sensible to use robust statistics for this problem and evaluate our method on simulated maps and on data from three real-world LC-MS studies.},
     author = {Ole Schulz-Trieglaff},
     month = {August},
     school = {Freie Universit{\"a}t Berlin},
     title = {Computational methods for Quantitative Peptide Mass Spectrometry},
     url = {http://publications.imp.fu-berlin.de/2534/},
     year = {2009}
    }
  • Andreas Gogol-Döring, “SeqAn”, 2009-11-26.
    cite this publication
    @phdthesis{fu_mi_publications2542,
     abstract = {SeqAn is a library of efficient algorithms and data structures for sequence analysis, which means processing large amounts of biomedical data like DNA or protein sequences. The library was developed for two groups of users: Software engineers can use it for the implementation of new software tools. Such tools are essential for biological and medical research. Algorithm designers may also use the library as a platform for the development, testing and comparison of algorithms. The project therefore contributes to bioinformatics engineering with the eventual purpose to promote the scientific research in life science.},
     author = {Andreas Gogol-D{\"o}ring},
     month = {November},
     school = {Freie Universit{\"a}t Berlin},
     title = {SeqAn},
     url = {http://publications.imp.fu-berlin.de/2542/},
     year = {2009}
    }
  • T. Rausch, S. Koren, G. Denisov, D. Weese, A.-K. Emde, A. Döring, K. Reinert, “A consistency-based consensus algorithm for de novo and reference-guided sequence assembly of short reads”, vol. 25, iss. 9, 2009.
    cite this publication
    @article{fu_mi_publications392,
     abstract = {Motivation: Novel high-throughput sequencing technologies pose new algorithmic challenges in handling massive amounts of short-read, high-coverage data. A robust and versatile consensus tool is of particular interest for such data since a sound multi-read alignment is a prerequisite for variation analyses, accurate genome assemblies and insert sequencing. Results: A multi-read alignment algorithm for de novo or reference-guided genome assembly is presented. The program identifies segments shared by multiple reads and then aligns these segments using a consistency-enhanced alignment graph. On real de novo sequencing data obtained from the newly established NCBI Short Read Archive, the program performs similarly in quality to other comparable programs. On more challenging simulated datasets for insert sequencing and variation analyses, our program outperforms the other tools. Availability: The consensus program can be downloaded from http://www.seqan.de/projects/consensus.html. It can be used stand-alone or in conjunction with the Celera Assembler. Both application scenarios as well as the usage of the tool are described in the documentation. Contact: rausch@inf.fu-berlin.de},
     author = {T. Rausch and S. Koren and G. Denisov and D. Weese and A.-K. Emde and A. D{\"o}ring and K. Reinert},
     journal = {Bioinformatics},
     number = {9},
     pages = {1118--1124},
     publisher = {Oxford University Press},
     title = {A consistency-based consensus algorithm for de novo and reference-guided sequence assembly of short reads},
     url = {http://publications.imp.fu-berlin.de/392/},
     volume = {25},
     year = {2009}
    }
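    Once the multi-read alignment itself is computed, calling the consensus is conceptually simple. A toy sketch of position-wise majority voting over a gapped alignment (the consistency-enhanced alignment graph that produces the alignment is the paper's actual contribution and is not attempted here):

    #include <iostream>
    #include <string>
    #include <vector>

    // Call the most frequent non-gap character per column of a gapped
    // multi-read alignment (rows padded to equal length with '-').
    std::string majorityConsensus(const std::vector<std::string>& rows)
    {
        std::string consensus;
        if (rows.empty()) return consensus;
        for (std::size_t col = 0; col < rows[0].size(); ++col)
        {
            int counts[256] = {0};
            for (const std::string& row : rows)
                if (row[col] != '-')
                    ++counts[static_cast<unsigned char>(row[col])];
            int best = 0, bestCount = 0;
            for (int c = 0; c < 256; ++c)
                if (counts[c] > bestCount) { bestCount = counts[c]; best = c; }
            if (bestCount > 0) consensus += static_cast<char>(best); // skip all-gap columns
        }
        return consensus;
    }

    int main()
    {
        std::vector<std::string> rows = {"ACG-T", "ACGAT", "A-GAT"};
        std::cout << majorityConsensus(rows) << '\n'; // prints ACGAT
    }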
  • R. A. Bauer, K. Rother, P. Moor, K. Reinert, T. Steinke, J. M. Bujnicki, R. Preissner, “Fast Structural Alignment of Biomolecules Using a Hash Table, N-Grams and String Descriptors”, vol. 2, iss. 2, 2009.
    cite this publication
    @article{fu_mi_publications450,
     abstract = {This work presents a generalized approach for the fast structural alignment of thousands of macromolecular structures. The method uses string representations of a macromolecular structure and a hash table that stores n-grams of a certain size for searching. To this end, macromolecular structure-to-string translators were implemented for protein and RNA structures. A query against the index is performed in two hierarchical steps to unite speed and precision. In the first step the query structure is translated into n-grams, and all target structures containing these n-grams are retrieved from the hash table. In the second step all corresponding n-grams of the query and each target structure are subsequently aligned, and after each alignment a score is calculated based on the matching n-grams of query and target. The extendable framework enables the user to query and structurally align thousands of protein and RNA structures on a commodity machine and is available as open source from http://lajolla.sf.net.},
     author = {R. A. Bauer and K. Rother and P. Moor and K. Reinert and T. Steinke and J. M. Bujnicki and R. Preissner},
     journal = {Algorithms and Molecular Sciences},
     number = {2},
     pages = {692--709},
     publisher = {MDPI},
     title = {Fast Structural Alignment of Biomolecules Using a Hash Table, N-Grams and String Descriptors},
     url = {http://publications.imp.fu-berlin.de/450/},
     volume = {2},
     year = {2009}
    }
  • D. Weese, A.-K. Emde, T. Rausch, A. Döring, K. Reinert, “RazerS - Fast Read Mapping with Sensitivity Control”, vol. 19, iss. 9, 2009-07-10.
    cite this publication
    @article{fu_mi_publications453,
     abstract = {Second-generation sequencing technologies deliver DNA sequence data at unprecedented high throughput. Common to most biological applications is a mapping of the reads to an almost identical or highly similar reference genome. Due to the large amounts of data, efficient algorithms and implementations are crucial for this task. We present an efficient read mapping tool called RazerS. It allows the user to align sequencing reads of arbitrary length using either the Hamming distance or the edit distance. Our tool can work either lossless or with a user-defined loss rate at higher speeds. Given the loss rate, we present an approach that guarantees not to lose more reads than specified. This enables the user to adapt to the problem at hand and provides a seamless tradeoff between sensitivity and running time.},
     author = {D. Weese and A.-K. Emde and T. Rausch and A. D{\"o}ring and K. Reinert},
     journal = {Genome Research},
     month = {July},
     number = {9},
     pages = {1646--1654},
     title = {RazerS - Fast Read Mapping with Sensitivity Control},
     url = {http://publications.imp.fu-berlin.de/453/},
     volume = {19},
     year = {2009}
    }
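    Filter-based mappers in the spirit of RazerS split read mapping into a filtration phase, which proposes candidate genome windows, and a verification phase, which checks them exactly. A minimal sketch of Hamming-distance verification with early abort (illustrative only; RazerS also supports edit distance, and its sensitivity control lives in the filter parameters, not here):

    #include <iostream>
    #include <string>

    // Check whether a read matches a genome window with at most k mismatches,
    // aborting as soon as the error budget is exhausted.
    bool withinHamming(const std::string& read, const std::string& window, int k)
    {
        if (read.size() != window.size()) return false;
        int mismatches = 0;
        for (std::size_t i = 0; i < read.size(); ++i)
            if (read[i] != window[i] && ++mismatches > k)
                return false; // early exit
        return true;
    }

    int main()
    {
        std::cout << std::boolalpha
                  << withinHamming("ACGTACGT", "ACGAACGT", 1) << ' '   // true
                  << withinHamming("ACGTACGT", "AAAAACGT", 1) << '\n'; // false
    }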
  • A. Zerck, E. Nordhoff, A. Resemann, E. Mirgorodskaya, D. Suckau, K. Reinert, H. Lehrach, J. Gobom, “An iterative strategy for precursor ion selection for LC-MS/MS based shotgun proteomics”, 2009-04-30.
    cite this publication
    @article{fu_mi_publications456,
     abstract = {Currently, the precursor ion selection strategies in LC-MS mainly choose the most prominent peptide signals for MS/MS analysis. Consequently, high abundant proteins are identified by MS/MS of many peptides whereas proteins of lower abundance might elude identification. We present a novel, iterative and result-driven approach for precursor ion selection that significantly increases the efficiency of an MS/MS analysis by decreasing data redundancy and analysis time. By simulating different strategies for precursor ion selection on an existing dataset we compare our method to existing result-driven strategies and evaluate its performance with regard to mass accuracy, database size, and sample complexity.},
     author = {A. Zerck and E. Nordhoff and A. Resemann and E. Mirgorodskaya and D. Suckau and K. Reinert and H. Lehrach and J. Gobom},
     journal = {Journal of Proteome Research},
     month = {April},
     publisher = {American Chemical Society},
     title = {An iterative strategy for precursor ion selection for LC-MS/MS based shotgun proteomics},
     url = {http://publications.imp.fu-berlin.de/456/},
     year = {2009}
    }

2008

  • Eva Lange, “Analysis of mass spectrometric data - peak picking and map alignment”, 2008-09-09.
    cite this publication
    @phdthesis{fu_mi_publications2528,
     abstract = {We study two fundamental processing steps in mass spectrometric data analysis from a theoretical and practical point of view. For the detection and extraction of mass spectral peaks we developed an efficient peak picking algorithm that is independent of the underlying machine or ionization method, and is able to resolve highly convoluted and asymmetric signals. The method uses the multiscale nature of spectrometric data by first detecting the mass peaks in the wavelet-transformed signal before a given asymmetric peak function is fitted to the raw data. In two optional stages, highly overlapping peaks can be separated or all peak parameters can be further improved using techniques from nonlinear optimization. In contrast to currently established techniques, our algorithm is able to separate overlapping peaks of multiply charged peptides in LC-ESI-MS data of low resolution. Furthermore, applied to high-quality MALDI-TOF spectra it yields a high degree of accuracy and precision and compares very favorably with the algorithms supplied by the vendor of the mass spectrometers. On the high-resolution MALDI spectra as well as on the low-resolution LC-MS data set, our algorithm achieves a fast runtime of only a few seconds. Another important processing step that can be found in every typical protocol for labelfree quantification is the combination of results from multiple LC-MS experiments to improve confidence in the obtained measurements or to compare results from different samples. To do so, a multiple alignment of the LC-MS maps needs to be estimated. The alignment has to correct for variations in mass and elution time which are present in all mass spectrometry experiments. For the first time we formally define the multiple LC-MS raw and feature map alignment problem using our own distance function for LC-MS maps. Furthermore, we present a solution to this problem. Our novel algorithm aligns LC-MS samples and matches corresponding ion species across samples. In a first step, it uses an adapted pose clustering approach to efficiently superimpose raw maps as well as feature maps. This is done in a star-wise manner, where the elements of all maps are transformed onto the coordinate system of a reference map. To detect and combine corresponding features in multiple feature maps into a so-called consensus map, we developed an additional step based on techniques from computational geometry. We show that our alignment approach is fast and reliable as compared to five other alignment approaches. Furthermore, we prove its robustness in the presence of noise and its ability to accurately align samples with only few common ion species.},
     author = {Eva Lange},
     month = {September},
     school = {Freie Universit{\"a}t Berlin},
     title = {Analysis of mass spectrometric data - peak picking and map alignment},
     url = {http://publications.imp.fu-berlin.de/2528/},
     year = {2008}
    }
  • T. O. F. Conrad, “New statistical algorithms for the analysis of mass spectrometry time-of- flight mass data with applications in clinical diagnostics”, 2008-10-01.
    cite this publication
    @phdthesis{fu_mi_publications2543,
     abstract = {Mass spectrometry (MS) based techniques have emerged as a standard for large-scale protein analysis. The ongoing progress in terms of more sensitive machines and improved data analysis algorithms led to a constant expansion of its fields of applications. Recently, MS was introduced into clinical proteomics with the prospect of early disease detection using proteomic pattern matching. Analyzing biological samples (e.g. blood) by mass spectrometry generates mass spectra that represent the components (molecules) contained in a sample as masses and their respective relative concentrations. In this work, we are interested in those components that are constant within a group of individuals but differ much between individuals of two distinct groups. These distinguishing components that depend on a particular medical condition are generally called biomarkers. Since not all biomarkers found by the algorithms are of equal (discriminating) quality we are only interested in a small biomarker subset that - as a combination - can be used as a fingerprint for a disease. Once a fingerprint for a particular disease (or medical condition) is identified, it can be used in clinical diagnostics to classify unknown spectra. In this thesis we have developed new algorithms for automatic extraction of disease specific fingerprints from mass spectrometry data. Special emphasis has been put on designing highly sensitive methods with respect to signal detection. Thanks to our statistically based approach our methods are able to detect signals even below the noise level inherent in data acquired by common MS machines, such as hormones. To provide access to these new classes of algorithms to collaborating groups we have created a web-based analysis platform that provides all necessary interfaces for data transfer, data analysis and result inspection. To prove the platform's practical relevance it has been utilized in several clinical studies, two of which are presented in this thesis. In these studies it could be shown that our platform is superior to commercial systems with respect to fingerprint identification. As an outcome of these studies several fingerprints for different cancer types (bladder, kidney, testicle, pancreas, colon and thyroid) have been detected and validated. The clinical partners in fact emphasize that these results would be impossible with a less sensitive analysis tool (such as the currently available systems). In addition to the issue of reliably finding and handling signals in noise we faced the problem of handling very large amounts of data, since an average dataset of an individual is about 2.5 Gigabytes in size and we have data of hundreds to thousands of persons. To cope with these large datasets, we developed a new framework for a heterogeneous (quasi) ad-hoc Grid - an infrastructure that allows the integration of thousands of computing resources (e.g. Desktop Computers, Computing Clusters or specialized hardware, such as IBM's Cell Processor in a Playstation 3).},
     author = {T. O. F. Conrad},
     month = {October},
     school = {Freie Universit{\"a}t Berlin},
     title = {New statistical algorithms for the analysis of mass spectrometry time-of- flight mass data with applications in clinical diagnostics},
     url = {http://publications.imp.fu-berlin.de/2543/},
     year = {2008}
    }
  • Markus Johann Bauer, “A combinatorial approach to RNA sequence-structure alignments”, 2008-07-30.
    cite this publication
    @phdthesis{fu_mi_publications2545,
     abstract = {Until a couple of years ago the scientific mainstream held that genetic information, stored as DNA strands, is transcribed to RNA, and RNA sequences are in turn translated to proteins, the actual functional units in the cell. RNA was generally believed to be a helper molecule in the cell until the beginning of the new millennium. This view changed. We see the potential of RNA as one of the key cellular players. In this thesis we present a novel framework for computing sequence-structure alignments of RNA sequences. Our contribution is twofold: first, we give a graph-theoretic model for the computation of multiple sequence-structure alignments. We phrase the model as an integer linear program (ILP) and show how we can relax the ILP such that we are able to compute optimal or near-optimal solutions for the original problem. In a subsequent step, we augment the initial model with stacking energies. Stacking base pairs greatly contribute to the energetic stability of the overall structure and should therefore be additionally rewarded. We extend the original ILP such that stacking energies are incorporated. Second, we give extensive computational results on real data from the RFAM database. We compare the performance of truly multiple sum-of-pairs sequence-structure alignments to heuristic sequence-structure alignments. We show that the objective function value of the sum-of-pairs model is generally higher compared to the heuristically inferred alignments. At the same time, we sketch the computational limits for the sum-of-pairs multiple sequence-structure model. The computational costs for computing exact multiple sequence-structure alignments are generally very high. To validate our approach on a larger test set, we run two implementations that take two sequences as their input. LaRA and SLaRA---based on the initial and the stack model---compute all pairwise sequence-structure alignments and use the external program TCOFFEE to infer a consistency-based multiple sequence-structure alignment. Additionally, we run the progressive versions PLaRA and PSLaRA on the same input data set. Our experiments on the BRAliBase benchmark set show that our tools are top-ranked for all input classes. Furthermore, our implementations need less running time compared to similar approaches. Subsequently, we compare two different algorithms for computing the optimal value of the Lagrangian dual and show that in our test setting the conceptually easier subgradient method is superior to the bundle method. Finally, we incorporate our Lagrangian relaxation approach into a branch-and-bound framework. We show for which instances we are able to compute provably optimal solutions and compare our results with previously published results of a branch-and-bound approach for the related quadratic knapsack problem.},
     author = {Markus Johann Bauer},
     month = {July},
     school = {Freie Universit{\"a}t Berlin},
     title = {A combinatorial approach to RNA sequence-structure alignments},
     url = {http://publications.imp.fu-berlin.de/2545/},
     year = {2008}
    }
  • Abha Singh Bais, “Annotated Alignments”, 2008-08-01.
    cite this publication
    @phdthesis{fu_mi_publications2851,
     abstract = {Elucidating the mechanisms of transcriptional regulation relies heavily on the sequence annotation of the binding sites of DNA-binding proteins called transcription factors. With the rationale that binding sites conserved across different species are more likely to be functional, the standard approach is to employ cross-species comparisons and focus the search on conserved regions. Usually, computational methods that annotate conserved binding sites perform the alignment and binding site annotation steps separately and combine the results in the end. If the binding site descriptions are weak or the sequence similarity is low, the local gap structure of the alignment poses a problem in detecting the conserved sites. In this thesis, I introduce a novel method that integrates the two axes of sequence conservation and binding site annotation in a simultaneous approach yielding \emph{annotated alignments} -- pairwise alignments with parts annotated as putative conserved transcription factor binding sites. Standard pairwise alignments are extended to include additional states for binding site profiles. A statistical framework that estimates profile-related parameters based on desired type I and type II errors is prescribed. This forms the core of the tool {\bf SimAnn}. As an extension, I use existing probabilistic models to demonstrate how the framework can be adapted to consider position-specific evolutionary characteristics of binding sites during parameter estimation. This underlies the tool {\bf eSimAnn}. Through simulations and real data analysis, I study the influence of considering a simultaneous approach as opposed to a multi-step one on resulting predictions. The former enables a local rearrangement in the alignment structure to bring forth perfectly aligned binding sites. This precludes the necessity of adopting post-processing steps to handle errors in pre-computed alignments, as is usually done in multi-step approaches. Additionally, the framework for parameter estimation is applicable to any novel profile of interest. Especially for instances with poor sequence conservation or profile quality, the simultaneous approach stands out. As a by-product of the analysis, I also present a formulation of the annotated alignment problem as an extended pair Hidden Markov Model and illustrate the correspondence between the various theoretical concepts.},
     author = {Abha Singh Bais},
     month = {August},
     school = {Freie Universit{\"a}t Berlin},
     title = {Annotated Alignments},
     url = {http://publications.imp.fu-berlin.de/2851/},
     year = {2008}
    }
  • M. Bauer, G. W. Klau, K. Reinert, “An Exact Mathematical Programming Approach to Multiple RNA Sequence-Structure Alignment.”, 2008.
    cite this publication
    @article{fu_mi_publications331,
     author = {M. Bauer and G. W. Klau and K. Reinert},
     journal = {Algorithmic Operations Research},
     note = {to appear},
     title = {An Exact Mathematical Programming Approach to Multiple RNA Sequence-Structure Alignment.},
     url = {http://publications.imp.fu-berlin.de/331/},
     year = {2008}
    }
  • M. Bodirsky, C. Gröpl, M. Kang, “Generating unlabeled connected cubic planar graphs uniformly at random”, vol. 32, iss. 2, 2008.
    cite this publication
    @article{fu_mi_publications338,
     abstract = {We present an expected polynomial time algorithm to generate an unlabeled connected cubic planar graph uniformly at random. We first consider
    rooted connected cubic planar graphs, i.e., we count connected cubic
    planar graphs up to isomorphisms that fix a certain directed edge.
    Based on decompositions along the connectivity structure, we derive
    recurrence formulas for the exact number of rooted cubic planar graphs.
    This leads to rooted 3-connected cubic planar graphs, which have
    a unique embedding on the sphere. Special care has to be taken for
    rooted graphs that have a sense-reversing automorphism. Therefore
    we introduce the concept of colored networks, which stand in bijective
    correspondence to rooted 3-connected cubic planar graphs with given
    symmetries. Colored networks can again be decomposed along the connectivity
    structure. For rooted 3-connected cubic planar graphs embedded in
    the plane, we switch to the dual and count rooted triangulations.
    Since all these numbers can be evaluated in polynomial time using
    dynamic programming, rooted connected cubic planar graphs can be
    generated uniformly at random in polynomial time by inverting the
    decomposition along the connectivity structure. To generate connected
    cubic planar graphs without a root uniformly at random, we apply
    rejection sampling and obtain an expected polynomial time algorithm.},
     author = {M. Bodirsky and C. Gr{\"o}pl and M. Kang},
     journal = {Random Structures and Algorithms},
     number = {2},
     pages = {157--180},
     title = {Generating unlabeled connected cubic planar graphs uniformly at random},
     url = {http://publications.imp.fu-berlin.de/338/},
     volume = {32},
     year = {2008}
    }
  • E. Lange, R. Tautenhahn, S. Neumann, C. Gröpl, “Critical assessment of alignment procedures for LC-MS proteomics and metabolomic measurements”, vol. 9, iss. 375, 2008.
    cite this publication
    @article{fu_mi_publications379,
     author = {E. Lange and R. Tautenhahn and S. Neumann and C. Gr{\"o}pl},
     journal = {BMC Bioinformatics},
     number = {375},
     title = {Critical assessment of alignment procedures for LC-MS proteomics and metabolomic measurements},
     url = {http://publications.imp.fu-berlin.de/379/},
     volume = {9},
     year = {2008}
    }
  • T. Rausch, A.-K. Emde, K. Reinert, “Robust consensus computation”, vol. 9, iss. Suppl 10, 2008.
    cite this publication
    @article{fu_mi_publications390,
     author = {T. Rausch and A.-K. Emde and K. Reinert},
     journal = {BMC Bioinformatics},
     number = {Suppl 10},
     pages = {P4},
     title = {Robust consensus computation},
     url = {http://publications.imp.fu-berlin.de/390/},
     volume = {9},
     year = {2008}
    }
  • T. Rausch, A.-K. Emde, D. Weese, C. Notredame, K. Reinert, “Segment-based multiple sequence alignment”, vol. 24, iss. 16, 2008.
    cite this publication
    @article{fu_mi_publications391,
     abstract = {Motivation: Many multiple sequence alignment tools have been developed in the past, progressing either in speed or alignment accuracy. Given
    the importance and wide-spread use of alignment tools, progress in
    both categories is a contribution to the community and has driven
    research in the field so far. Results: We introduce a graph-based
    extension to the consistency-based, progressive alignment strategy.
    We apply the consistency notion to segments instead of single characters.
    The main problem we solve in this context is to define segments of
    the sequences in such a way that a graph-based alignment is possible.
    We implemented the algorithm using the SeqAn library and report results
    on amino acid and DNA sequences. The benefit of our approach is threefold:
    (1) sequences with conserved blocks can be rapidly aligned, (2) the
    implementation is conceptually easy, generic and fast and (3) the
    consistency idea can be extended to align multiple genomic sequences.
    Availability: The segment-based multiple sequence alignment tool
    can be downloaded from http://www.seqan.de/projects/msa.html. A novel
    version of T-Coffee interfaced with the tool is available from http://www.tcoffee.org.
    The usage of the tool is described in both documentations. Contact:
    rausch@inf.fu-berlin.de},
     author = {T. Rausch and A.-K. Emde and D. Weese and C. Notredame and K. Reinert},
     journal = {Bioinformatics},
     number = {16},
     pages = {i187--192},
     title = {Segment-based multiple sequence alignment},
     url = {http://publications.imp.fu-berlin.de/391/},
     volume = {24},
     year = {2008}
    }
  • T. Rausch, A. Thomas, N. J. Camp, L. A. Facelli, “A parallel genetic algorithm to discover patterns in genetic markers that indicate predisposition to multifactorial disease”, vol. 38, 2008-07.
    cite this publication
    @article{fu_mi_publications394,
     abstract = {This paper describes a novel algorithm to analyze genetic linkage data using pattern recognition techniques and genetic algorithms
    (GA). The method allows a search for regions of the chromosome that
    may contain genetic variations that jointly predispose individuals
    for a particular disease. The method uses correlation analysis, filtering
    theory and genetic algorithms to achieve this goal. Because current
    genome scans use from hundreds to hundreds of thousands of markers,
    two versions of the method have been implemented. The first is an
    exhaustive analysis version that can be used to visualize, explore,
    and analyze small genetic data sets for two marker correlations;
    the second is a GA version, which uses a parallel implementation
    allowing searches of higher-order correlations in large data sets.
    Results on simulated data sets indicate that the method can be informative
    in the identification of major disease loci and gene--gene interactions
    in genome-wide linkage data and that further exploration of these
    techniques is justified. The results presented for both variants
    of the method show that it can help genetic epidemiologists to identify
    promising combinations of genetic factors that might predispose to
    complex disorders. In particular, the correlation analysis of IBD
    expression patterns might hint to possible gene--gene interactions
    and the filtering might be a fruitful approach to distinguish true
    correlation signals from noise.},
     author = {T. Rausch and A. Thomas and N. J. Camp and L. A. Facelli},
     journal = {Comput. Biol. Med.},
     month = {July},
     pages = {826--836},
     title = {A parallel genetic algorithm to discover patterns in genetic markers that indicate predisposition to multifactorial disease},
     url = {http://publications.imp.fu-berlin.de/394/},
     volume = {38},
     year = {2008}
    }
  • M. H. Schulz, D. Weese, T. Rausch, A. Döring, K. Reinert, M. Vingron, “Fast and Adaptive Variable Order Markov Chain Construction”, 2008.
    cite this publication
    @inproceedings{fu_mi_publications401,
     abstract = { Variable order Markov chains (VOMCs) are a flexible class of models that extend the well-known Markov chains. They have been applied
    to a variety of problems in computational biology, e.g. protein family
    classification. A linear time and space construction algorithm has
    been published in 2000 by Apostolico and Bejerano. However, neither
    a report of the actual running time nor an implementation of it has
    ever been published since. Using their theoretical results, we implement
    general and problem oriented algorithms considering recent advances
    in string matching. We introduce a new software which is orders of
    magnitudes faster than current tools for building VOMCs, and is suitable
    for large scale analysis. Along the way we show that the lazy suffix
    tree algorithm by Giegerich and others can compete with state-of-the-art
    suffix array methods in terms of time and space under the type of
    constraints we have analyzed in this work. },
     author = {M. H. Schulz and D. Weese and T. Rausch and A. D{\"o}ring and K. Reinert and M. Vingron},
     booktitle = {Proceedings of the 8th International Workshop on Algorithms in Bioinformatics (WABI'08)},
     pages = {306--317},
     publisher = {Springer Verlag},
     title = {Fast and Adaptive Variable Order Markov Chain Construction},
     url = {http://publications.imp.fu-berlin.de/401/},
     year = {2008}
    }
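    The statistics behind a variable order Markov chain are plain context frequencies. A deliberately naive sketch of what gets counted (the paper's point is computing these counts in linear time via a lazy suffix tree; this quadratic hash-map version only shows the quantity being computed):

    #include <iostream>
    #include <string>
    #include <unordered_map>

    // Count every context (substring) of the text up to a maximum order.
    // A VOMC is obtained by pruning such counts into variable-length contexts.
    std::unordered_map<std::string, int>
    contextCounts(const std::string& text, std::size_t maxOrder)
    {
        std::unordered_map<std::string, int> counts;
        for (std::size_t i = 0; i < text.size(); ++i)
            for (std::size_t len = 1; len <= maxOrder && i + len <= text.size(); ++len)
                ++counts[text.substr(i, len)];
        return counts;
    }

    int main()
    {
        auto counts = contextCounts("ACGACGT", 2);
        std::cout << counts["AC"] << ' ' << counts["GT"] << '\n'; // prints 2 1
    }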
  • O. Schulz-Trieglaff, R. Hussong, C. Gröpl, A. Hildebrandt, A. Hildebrandt, Ch. Huber, K. Reinert, “Computational Quantification of Peptides from LC-MS data”, vol. 15, iss. 7, 2008.
    cite this publication
    @article{fu_mi_publications406,
     abstract = {Liquid chromatography coupled to mass spectrometry (LC-MS) has become a major tool for the study of biological processes. High-throughput LC-MS experiments are frequently conducted in modern laboratories, generating an enormous amount of data per day. A manual inspection is therefore no longer a feasible task. Consequently, there is a need for computational tools that can rapidly provide information about mass, elution time, and abundance of the compounds in an LC-MS sample. We present an algorithm for the detection and quantification of peptides in LC-MS data. Our approach is flexible and independent of the MS technology in use. It is based on a combination of the sweep line paradigm with a novel wavelet function tailored to detect isotopic patterns of peptides. We propose a simple voting schema to use the redundant information in consecutive scans for an accurate determination of monoisotopic masses and charge states. By explicitly modeling the instrument inaccuracy, we are also able to cope with data sets of different quality and resolution. We evaluate our technique on data from different instruments and show that we can rapidly estimate mass, centroid of retention time and abundance of peptides in a sound algorithmic framework. Finally, we compare the performance of our method to several other techniques on three data sets of varying complexity.},
     author = {O. Schulz-Trieglaff and R. Hussong and C. Gr{\"o}pl and A. Hildebrandt and A. Hildebrandt and Ch. Huber and K. Reinert},
     journal = {Journal of Computational Biology},
     keywords = {computational mass spectrometry, liquid chromatography - mass spectrometry, quantification, wavelets},
     number = {7},
     pages = {685--704},
     title = {Computational Quantification of Peptides from LC-MS data},
     url = {http://publications.imp.fu-berlin.de/406/},
     volume = {15},
     year = {2008}
    }
  • O. Schulz-Trieglaff, N. Pfeifer, C. Gröpl, O. Kohlbacher, K. Reinert, “LC-MSsim - a simulation software for liquid chromatography mass spectrometry data”, vol. 9, iss. 423, 2008.
    cite this publication
    @article{fu_mi_publications408,
     abstract = {BACKGROUND: Mass Spectrometry coupled to Liquid Chromatography (LC-MS) is commonly used to analyze the protein content of biological samples
    in large scale studies. The data resulting from an LC-MS experiment
    is huge, highly complex and noisy. Accordingly, it has sparked new
    developments in Bioinformatics, especially in the fields of algorithm
    development, statistics and software engineering. In a quantitative
    label-free mass spectrometry experiment, crucial steps are the detection
    of peptide features in the mass spectra and the alignment of samples
    by correcting for shifts in retention time. At the moment, it is
    difficult to compare the plethora of algorithms for these tasks.
    So far, curated benchmark data exists only for peptide identification
    algorithms but no data that represents a ground truth for the evaluation
    of feature detection, alignment and filtering algorithms. RESULTS:
    We present LC-MSsim, a simulation software for LC-ESI-MS experiments.
    It simulates ESI spectra on the MS level. It reads a list of proteins
    from a FASTA file and digests the protein mixture using a user-defined
    enzyme. The software creates an LC-MS data set using a predictor
    for the retention time of the peptides and a model for peak shapes
    and elution profiles of the mass spectral peaks. Our software also
    offers the possibility to add contaminants, to change the background
    noise level and includes a model for the detectability of peptides
    in mass spectra. After the simulation, LC-MSsim writes the simulated
    data to public XML formats (mzXML or mzData). The software also stores
    the positions (monoisotopic m/z and retention time) and ion counts
    of the simulated ions in separate files. CONCLUSIONS: LC-MSsim generates
    simulated LC-MS data sets and incorporates models for peak shapes
    and contaminations. Algorithm developers can match the results of
    feature detection and alignment algorithms against the simulated
    ion lists and meaningful error rates can be computed. We anticipate
    that LC-MSsim will be useful to the wider community to perform benchmark
    studies and comparisons between computational tools.},
     author = {O. Schulz-Trieglaff and N. Pfeifer and C. Gr{\"o}pl and O. Kohlbacher and K. Reinert},
     journal = {BMC Bioinformatics},
     keywords = {algorithm, benchmark, lc-ms-ms, massspec, metabolomics, proteomics},
     number = {423},
     title = {LC-MSsim - a simulation software for liquid chromatography mass spectrometry data},
     url = {http://publications.imp.fu-berlin.de/408/},
     volume = {9},
     year = {2008}
    }
  • M. Sturm, A. Bertsch, C. Gröpl, R. Hussong, E. Lange, N. Pfeifer, O. Schulz-Trieglaff, A. Zerck, K. Reinert, O. Kohlbacher, “OpenMS - An open-source software framework for mass spectrometry”, vol. 9, iss. 163, 2008.
    cite this publication
    @article{fu_mi_publications409,
     abstract = {BACKGROUND: Mass spectrometry is an essential analytical technique for high-throughput analysis in proteomics and metabolomics. The
    development of new separation techniques, precise mass analyzers
    and experimental protocols is a very active field of research. This
    leads to more complex experimental setups yielding ever increasing
    amounts of data. Consequently, analysis of the data is currently
    often the bottleneck for experimental studies. Although software
    tools for many data analysis tasks are available today, they are
    often hard to combine with each other or not flexible enough to allow
    for rapid prototyping of a new analysis workflow. RESULTS: We present
    OpenMS, a software framework for rapid application development in
    mass spectrometry. OpenMS has been designed to be portable, easy-to-use
    and robust while offering a rich functionality ranging from basic
    data structures to sophisticated algorithms for data analysis. This
    has already been demonstrated in several studies. CONCLUSIONS: OpenMS
    is available under the Lesser GNU Public License (LGPL) from the
    project website at http://www.openms.de.},
     author = {M. Sturm and A. Bertsch and C. Gr{\"o}pl and R. Hussong and E. Lange and N. Pfeifer and O. Schulz-Trieglaff and A. Zerck and K. Reinert and O. Kohlbacher},
     journal = {BMC Bioinformatics},
     keywords = {lc-ms-ms, massspec, proteomics},
     number = {163},
     title = {OpenMS - An open-source software framework for mass spectrometry},
     url = {http://publications.imp.fu-berlin.de/409/},
     volume = {9},
     year = {2008}
    }
  • D. Weese, M. H. Schulz, “Efficient String Mining under Constraints via the Deferred Frequency Index”, 2008-07.
    cite this publication
    @inproceedings{fu_mi_publications412,
     abstract = { We propose a general approach for frequency based string mining, which has many applications, e.g. in contrast data mining. Our contribution
    is a novel algorithm based on a deferred data structure. Despite
    its simplicity, our approach is up to 4 times faster and uses about
    half the memory compared to the best-known algorithm of Fischer et
    al. Applications in various string domains, e.g. natural language,
    DNA or protein sequences, demonstrate the improvement of our algorithm.},
     author = {D. Weese and M. H. Schulz},
     booktitle = {Proceedings of the 8th Industrial Conference on Data Mining (ICDM'08)},
     editor = {P. Perner},
     month = {July},
     pages = {374--388},
     publisher = {Springer Verlag},
     title = {Efficient String Mining under Constraints via the Deferred Frequency Index},
     url = {http://publications.imp.fu-berlin.de/412/},
     year = {2008}
    }
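    Frequency-based string mining as described above asks for substrings that are frequent in one database and rare in another. An exhaustive q-gram sketch of that predicate (illustrative thresholds; the deferred frequency index answers such queries without materializing all substring frequencies up front):

    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <unordered_set>
    #include <vector>

    // Document frequency of every q-gram: in how many strings of the
    // database does the q-gram occur at least once?
    std::unordered_map<std::string, int>
    docFrequency(const std::vector<std::string>& db, std::size_t q)
    {
        std::unordered_map<std::string, int> df;
        for (const std::string& s : db)
        {
            std::unordered_set<std::string> seen;
            for (std::size_t i = 0; i + q <= s.size(); ++i)
                seen.insert(s.substr(i, q));
            for (const std::string& g : seen) ++df[g];
        }
        return df;
    }

    int main()
    {
        std::vector<std::string> fg = {"ACGT", "TACG", "ACGA"}; // foreground
        std::vector<std::string> bg = {"TTTT", "GTTT"};         // background
        auto dfFg = docFrequency(fg, 3);
        auto dfBg = docFrequency(bg, 3);
        for (const auto& kv : dfFg)                    // emerging substrings:
            if (kv.second >= 2 && dfBg[kv.first] == 0) // frequent in fg, absent in bg
                std::cout << kv.first << '\n';         // prints ACG
    }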
  • A. Döring, D. Weese, T. Rausch, K. Reinert, “SeqAn -- An efficient, generic C++ library for sequence analysis”, vol. 9, iss. 1, 2008.
    cite this publication
    @article{fu_mi_publications455,
     abstract = {BACKGROUND: The use of novel algorithmic techniques is pivotal to many important problems in life science. For example the sequencing of the human genome [1] would not have been possible without advanced assembly algorithms. However, owing to the high speed of technological progress and the urgent need for bioinformatics tools, there is a widening gap between state-of-the-art algorithmic techniques and the actual algorithmic components of tools that are in widespread use. RESULTS: To remedy this trend we propose the use of SeqAn, a library of efficient data types and algorithms for sequence analysis in computational biology. SeqAn comprises implementations of existing, practical state-of-the-art algorithmic components to provide a sound basis for algorithm testing and development. In this paper we describe the design and content of SeqAn and demonstrate its use by giving two examples. In the first example we show an application of SeqAn as an experimental platform by comparing different exact string matching algorithms. The second example is a simple version of the well-known MUMmer tool rewritten in SeqAn. Results indicate that our implementation is very efficient and versatile to use. CONCLUSION: We anticipate that SeqAn greatly simplifies the rapid development of new bioinformatics tools by providing a collection of readily usable, well-designed algorithmic components which are fundamental for the field of sequence analysis. This leverages not only the implementation of new algorithms, but also enables a sound analysis and comparison of existing algorithms.},
     author = {A. D{\"o}ring and D. Weese and T. Rausch and K. Reinert},
     journal = {BMC Bioinformatics},
     number = {1},
     pages = {11},
     title = {SeqAn -- An efficient, generic C++ library for sequence analysis},
     url = {http://publications.imp.fu-berlin.de/455/},
     volume = {9},
     year = {2008}
    }
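    The string matching comparison mentioned in the abstract uses SeqAn's generic Finder/Pattern interface, in which the search algorithm is selected at compile time through a template tag. A short example in that style, assuming a SeqAn installation on the include path (other tags can be swapped in for Horspool to switch algorithms):

    #include <iostream>
    #include <seqan/find.h>

    int main()
    {
        seqan::CharString haystack = "send more money to more people";
        seqan::CharString needle = "more";

        seqan::Finder<seqan::CharString> finder(haystack);
        seqan::Pattern<seqan::CharString, seqan::Horspool> pattern(needle);

        while (find(finder, pattern))              // iterate over all matches
            std::cout << position(finder) << '\n'; // expected: 5 and 19
    }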

2007

  • E. Lange, C. Gröpl, O. Schulz-Trieglaff, A. Leinenbach, Ch. Huber, K. Reinert, “A Geometric Approach for the Alignment of Liquid Chromatography-Mass Spectrometry Data”, vol. 23, iss. 13, 2007.
    cite this publication
    @article{fu_mi_publications1133,
     abstract = {Motivation: Liquid chromatography coupled to mass spectrometry (LC-MS) and combined with tandem mass spectrometry (LC-MS/MS) have become a prominent tool for the analysis of complex proteomic samples. An important step in a typical workflow is the combination of results from multiple LC-MS experiments to improve confidence in the obtained measurements or to compare results from different samples. To do so, a suitable mapping or alignment between the data sets needs to be estimated. The alignment has to correct for variations in mass and elution time which are present in all mass spectrometry experiments. 
    
    Results: We propose a novel algorithm to align LC-MS samples and to match corresponding ion species across samples. Our algorithm matches landmark signals between two data sets using a geometric technique based on pose clustering. Variations in mass and retention time are corrected by an affine dewarping function estimated from matched landmarks. We use the pairwise dewarping in an algorithm for aligning multiple samples. We show that our pose clustering approach is fast and reliable as compared to previous approaches. It is robust in the presence of noise and able to accurately align samples with only few common ion species. In addition, we can easily handle different kinds of LC-MS data and adopt our algorithm to new mass spectrometry technologies. 
    
    Availability: This algorithm is implemented as part of the OpenMS software library for shotgun proteomics and available under the Lesser GNU Public License (LGPL) at www.openms.de},
     author = {E. Lange and C. Gr{\"o}pl and O. Schulz-Trieglaff and A. Leinenbach and Ch. Huber and K. Reinert},
     booktitle = {Proceedings of the 15th Annual International Conference on Intelligent Systems for Molecular Biology (ISMB) \& 6th European Conference on Computational Biology (ECCB)},
     journal = {Oxford Journals},
     number = {13},
     pages = {i273--i281},
     publisher = {Oxford University Press},
     title = {A Geometric Approach for the Alignment of Liquid Chromatography-Mass Spectrometry Data},
     url = {http://publications.imp.fu-berlin.de/1133/},
     volume = {23},
     year = {2007}
    }
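    The dewarping step above reduces to fitting an affine map t' = a*t + b from landmark pairs matched across two runs. A self-contained least-squares sketch (the pose clustering that produces those correspondences is the paper's actual contribution and is omitted; the data below are made up):

    #include <iostream>
    #include <utility>
    #include <vector>

    // Ordinary least-squares fit of t' = a*t + b from (t, t') landmark pairs.
    void fitAffine(const std::vector<std::pair<double, double>>& pairs,
                   double& a, double& b)
    {
        double n = pairs.size(), sx = 0, sy = 0, sxx = 0, sxy = 0;
        for (const auto& p : pairs)
        {
            sx += p.first;
            sy += p.second;
            sxx += p.first * p.first;
            sxy += p.first * p.second;
        }
        a = (n * sxy - sx * sy) / (n * sxx - sx * sx); // slope
        b = (sy - a * sx) / n;                         // intercept
    }

    int main()
    {
        // Matched retention times (run A, run B), consistent with t' = 1.1*t + 3.
        std::vector<std::pair<double, double>> landmarks = {{10, 14}, {20, 25}, {30, 36}};
        double a, b;
        fitAffine(landmarks, a, b);
        std::cout << "a = " << a << ", b = " << b << '\n'; // a = 1.1, b = 3
    }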
  • Witold Eryk Wolski, “Analysis of sets and collections of Peptide Mass Fingerprint data”, 2007-09-18.
    cite this publication
    @phdthesis{fu_mi_publications2536,
     abstract = {Recent advances in genomics, whose outstanding achievements were exemplified by the complete sequencing of the human genome, provided the infrastructure and information enabling the development of several proteomic technologies. Currently no single proteomic analysis strategy can sufficiently address the question of how the proteome is organised in terms of numerical complexity and complexity generated by the protein-protein interactions forming supramolecular complexes within the cell.
    
    In order to bring a detailed structural/functional picture of these complexes in whole genomes, cells, organelles or in normal and pathological states several proteomic strategies can be utilised. Combination of technologies will bring a more detailed answer to what are the components of certain cellular pathways (e.g.: targets of kinases/phosphatases, cytoskeletal proteins, signalling molecules), how do they interconnect, how are they modified in the cell and what are the roles of several complex components in normal and disease conditions.
    
    These types of studies depend on fast and high throughput methods of protein identification. One of the most common methods of analysis is a mass spectrometric technique called peptide mapping. Peptide mapping is the comparison of mass spectrometrically determined peptide masses of a sequence specific digest of a single protein or peptide of interest with peptide masses predicted from genomic databases. In this work several contributions to the computational analysis of mass spectrometric data are presented. During the course of my studies I looked at the distribution of peptide masses in sequence specific protein sequence digests and developed a simple mathematical model dealing with peptide mass cluster centre location. I have introduced and studied the methods of calibration of mass spectrometric peak-lists without resorting to internal or external calibration samples. Also of importance is the contribution of this work to the calibration of data produced in high throughput experiments. In addition, I studied how filtering of non-peptide peaks influences the identification rates in mass spectrometric instruments. Furthermore, I focused my studies on measures of spectra similarity which can be used to acquire supplementary information, increasing the sensitivity and specificity of database searches.},
     author = {Witold Eryk Wolski},
     month = {September},
     school = {Freie Universit{\"a}t Berlin},
     title = {Analysis of sets and collections of Peptide Mass Fingerprint data},
     url = {http://publications.imp.fu-berlin.de/2536/},
     year = {2007}
    }
  • Wasinee Rungsarityotin, “Algorithm to identify protein complexes from high-throughput data”, 2007-12-03.
    cite this publication
    @phdthesis{fu_mi_publications2539,
     abstract = {Recent advances in proteomic technologies such as two-hybrid and biochemical purification allow large-scale investigations of protein interactions. The goal of this thesis is to investigate model-based approaches to predict protein complexes from tandem affinity purification experiments. We compare a simple overlapping model to a partitioning model. In addition, we propose a visualization framework to delineate overlapping complexes from experimental data. We propose two models to predict protein complexes from experimental data. Our first model is in some sense the simplest possible one. It is based on frequent itemset mining, which merely counts the incidence of certain sets of proteins within the experimental results. The affinity of two sets of proteins to form clusters is modeled to be independent, regardless of any overlapping members between these sets. Our second model assumes that formation of protein complexes can be reduced to pairwise interactions between proteins. Interactions between proteins are more likely for pairs of proteins if they come from the same cluster. Based on this model, we use Markov Random Field theory to calculate a maximum-likelihood assignment of proteins to clusters.},
     author = {Wasinee Rungsarityotin},
     month = {December},
     school = {Freie Universit{\"a}t Berlin},
     title = {Algorithm to identify protein complexes from high-throughput data},
     url = {http://publications.imp.fu-berlin.de/2539/},
     year = {2007}
    }
  • M. Bauer, G. W. Klau, K. Reinert, “Accurate multiple sequence-structure alignment of RNA sequences using combinatorial optimization.”, vol. 8, iss. 1, 2007-07.
    cite this publication
    @article{fu_mi_publications332,
     author = {M. Bauer and G. W. Klau and K. Reinert},
     journal = {BMC Bioinformatics},
     month = {July},
     number = {1},
     pages = {271},
     title = {Accurate multiple sequence-structure alignment of RNA sequences using combinatorial optimization.},
     url = {http://publications.imp.fu-berlin.de/332/},
     volume = {8},
     year = {2007}
    }
  • M. Bodirsky, C. Gröpl, D. Johannsen, M. Kang, “A direct decomposition of 3-connected planar graphs”, vol. B54Ak, 2007.
    cite this publication
    @article{fu_mi_publications336,
     address = {Taormina},
     author = {M. Bodirsky and C. Gr{\"o}pl and D. Johannsen and M. Kang},
     journal = {S{\'e}minaire Lotharingien de Combinatoire},
     keywords = {Three-Connected Planar Graph, Cnet, Planar Graph, Enumeration, Random graphs, Random Generation, Dynamic Programming, Graph Theory},
     pages = {15 pages},
     title = {A direct decomposition of 3-connected planar graphs},
     url = {http://publications.imp.fu-berlin.de/336/},
     volume = {B54Ak},
     year = {2007}
    }
  • M. Bodirsky, C. Gröpl, M. Kang, “Generating labeled planar graphs uniformly at random”, vol. 379, iss. 3, 2007.
    cite this publication
    @article{fu_mi_publications339,
     abstract = {We present a deterministic polynomial time algorithm to sample a labeled planar graph uniformly at random. Our approach uses recursive formulae for the exact number of labeled planar graphs with n vertices and m edges, based on a decomposition into 1-, 2-, and 3-connected components. We can then use known sampling algorithms and counting formulae for 3-connected planar graphs.},
     author = {M. Bodirsky and C. Gr{\"o}pl and M. Kang},
     journal = {Theoretical Computer Science},
     keywords = {Labeled planar graph, Enumeration, Decomposition, Sampling algorithm, Dynamic programming, Graph theory},
     number = {3},
     pages = {377--386},
     title = {Generating labeled planar graphs uniformly at random},
     url = {http://publications.imp.fu-berlin.de/339/},
     volume = {379},
     year = {2007}
    }
  • D. Fasulo, A.-K. Emde, L.-Y. Wang, N. J. Edwards, “Alignment of Mass Spectrometry Data by Clique Finding and Optimization”, vol. 4532, 2007.
    cite this publication
    @inproceedings{fu_mi_publications345,
     abstract = {Mass spectrometry (MS) is becoming a popular approach for quantifying the protein composition of complex samples. A great challenge for comparative proteomic profiling is to match corresponding peptide features from different experiments to ensure that the same protein intensities are correctly identified. Multi-dimensional data acquisition from liquid-chromatography mass spectrometry (LC-MS) makes the alignment problem harder. We propose a general paradigm for aligning peptide features using a bounded error model. Our method is tolerant of imperfect measurements, missing peaks, and extraneous peaks. It can handle an arbitrary number of dimensions of separation, and is very fast in practice even for large data sets. Finally, its parameters are intuitive and we describe a heuristic for estimating them automatically. We demonstrate results on single- and multi-dimensional data.},
     author = {D. Fasulo and A.-K. Emde and L.-Y. Wang and N. J. Edwards},
     booktitle = {Systems Biology and Computational Proteomics},
     pages = {119--129},
     publisher = {Springer Verlag},
     series = {Lecture Notes in Computer Science},
     title = {Alignment of Mass Spectrometry Data by Clique Finding and Optimization},
     url = {http://publications.imp.fu-berlin.de/345/},
     volume = {4532},
     year = {2007}
    }
  • G. W. Klau, S. Rahmann, A. Schliep, M. Vingron, K. Reinert, “Integer Linear Programming Approaches for Non-unique Probe Selection”, vol. 155, 2007.
    cite this publication
    @article{fu_mi_publications371,
     author = {G. W. Klau and S. Rahmann and A. Schliep and M. Vingron and K. Reinert},
     journal = {Discrete Applied Mathematics},
     pages = {840--856},
     title = {Integer Linear Programming Approaches for Non-unique Probe Selection},
     url = {http://publications.imp.fu-berlin.de/371/},
     volume = {155},
     year = {2007}
    }
  • P. May, G. W. Klau, M. Bauer, T. Steinke, “Accelerated microRNA Precursor Detection Using the Smith-Waterman Algorithm on FPGAs”, vol. 4360, 2007.
    cite this publication
    @inproceedings{fu_mi_publications384,
     author = {P. May and G. W. Klau and M. Bauer and T. Steinke},
     booktitle = {Proc. of GCCB 2006},
     pages = {19--32},
     series = {LNBI},
     title = {Accelerated microRNA Precursor Detection Using the Smith-Waterman Algorithm on FPGAs},
     url = {http://publications.imp.fu-berlin.de/384/},
     volume = {4360},
     year = {2007}
    }
  • Ch. Meier, “Bioinformatik: Aktuelle Proteinausstattung erlaubt Aussagen über den Gesundheitszustand. Software hilft Ärzten künftig bei der Diagnose.”, 2007-06.
    cite this publication
    @misc{fu_mi_publications386,
     author = {Ch. Meier},
     month = {June},
     note = {The article, by a freelance science journalist, describes the use of OpenMS in proteomics},
     title = {Bioinformatik: Aktuelle Proteinausstattung erlaubt Aussagen {\"u}ber den Gesundheitszustand. Software hilft {\"A}rzten k{\"u}nftig bei der Diagnose.},
     url = {http://publications.imp.fu-berlin.de/386/},
     year = {2007}
    }
  • K. Reinert, M. Bauer, A. Döring, A. L. Halpern, “A general paradigm for fast and adaptive clustering of biological sequences”, 2007.
    cite this publication
    @inproceedings{fu_mi_publications395,
     author = {K. Reinert and M. Bauer and A. D{\"o}ring and A. L. Halpern},
     booktitle = {German Conference on Bioinformatics (GCB 2007)},
     pages = {15--29},
     title = {A general paradigm for fast and adaptive clustering of biological sequences},
     url = {http://publications.imp.fu-berlin.de/395/},
     year = {2007}
    }
  • K. Reinert, D. H. Huson, “Sequence Assembly”, vol. 1, 2007-12.
    cite this publication
    @incollection{fu_mi_publications396,
     address = {Weinheim},
     author = {K. Reinert and D. H. Huson},
     month = {December},
     pages = {25--55},
     publisher = {Wiley-VCH},
     series = {Bioinformatics - From Genomes to Therapies},
     title = {Sequence Assembly},
     url = {http://publications.imp.fu-berlin.de/396/},
     volume = {1},
     year = {2007}
    }
  • O. Schulz-Trieglaff, R. Hussong, C. Gröpl, A. Hildebrandt, K. Reinert, “A Fast and Accurate Algorithm for the Quantification of Peptides from Mass Spectrometry data”, 2007.
    cite this publication
    @inproceedings{fu_mi_publications407,
     author = {O. Schulz-Trieglaff and R. Hussong and C. Gr{\"o}pl and A. Hildebrandt and K. Reinert},
     booktitle = {Proceedings of the Eleventh Annual International Conference on Research in Computational Molecular Biology (RECOMB 2007)},
     pages = {473--487},
     title = {A Fast and Accurate Algorithm for the Quantification of Peptides from Mass Spectrometry data},
     url = {http://publications.imp.fu-berlin.de/407/},
     year = {2007}
    }
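
The Wolski thesis above ("Analysis of sets and collections of Peptide Mass Fingerprint data") models the location of peptide mass cluster centres: peptide masses are not spread uniformly but gather around a near-regular grid. A minimal sketch of that idea, assuming the centres sit near a grid of spacing about 1.000495 Da (the commonly quoted average) and using invented example masses; this is an illustration, not the thesis' actual model:

    import numpy as np

    # Hypothetical peptide monoisotopic masses in Da (invented numbers);
    # real input would be a peak-list from a sequence-specific digest.
    masses = np.array([1045.52, 1296.65, 2465.22])
    LAMBDA0 = 1.000495                       # commonly quoted average spacing (Da)
    k = np.round(masses / LAMBDA0)           # nominal-mass index of each peptide
    lam = float(masses @ k) / float(k @ k)   # refined spacing: least squares through 0
    defects = masses - lam * k               # distance of each mass to its cluster centre
    print(f"estimated spacing: {lam:.6f} Da")
    print("distance to cluster centres (Da):", np.round(defects, 4))

Deviations from such a grid are the kind of signal the thesis uses to calibrate peak-lists without internal or external calibrants.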

2006

  • E. Althaus, A. Caprara, H.-P. Lenhof, K. Reinert, “A Branch-and-Cut Algorithm for Multiple Sequence Alignment”, iss. 105, 2006.
    cite this publication
    @article{fu_mi_publications325,
     author = {E. Althaus and A. Caprara and H.-P. Lenhof and K. Reinert},
     journal = {Mathematical Programming},
     number = {105},
     pages = {387--425},
     title = {A Branch-and-Cut Algorithm for Multiple Sequence Alignment},
     url = {http://publications.imp.fu-berlin.de/325/},
     year = {2006}
    }
  • O. Kohlbacher, K. Reinert, C. Gröpl, E. Lange, N. Pfeiffer, O. Schulz-Trieglaff, M. Sturm, “TOPP - The OpenMS Proteomics Pipeline”, 2006.
    cite this publication
    @inproceedings{fu_mi_publications375,
     abstract = {Motivation: Experimental techniques in proteomics have seen rapid development over the last few years. Volume and complexity of the data have both been growing at a similar rate. Accordingly, data management and analysis are one of the major challenges in proteomics. Flexible algorithms are required to handle changing experimental setups and to assist in developing and validating new methods. In order to facilitate these studies, it would be desirable to have a flexible `toolbox' of versatile and user-friendly applications allowing for rapid construction of computational workflows in proteomics. Results: We describe a set of tools for proteomics data analysis -- TOPP, The OpenMS Proteomics Pipeline. TOPP provides a set of computational tools which can be easily combined into analysis pipelines even by non-experts and can be used in proteomics workflows. These applications range from useful utilities (file format conversion, peak picking) over wrapper applications for known applications (e.g. Mascot) to completely new algorithmic techniques for data reduction and data analysis. We anticipate that TOPP will greatly facilitate rapid prototyping of proteomics data evaluation pipelines. As such, we describe the basic concepts and the current abilities of TOPP and illustrate these concepts in the context of two example applications: the identification of peptides from a raw data set through database search and the complex analysis of a standard addition experiment for the absolute quantitation of biomarkers. The latter example demonstrates TOPP's ability to construct flexible analysis pipelines in support of complex experimental setups. Availability: The TOPP components are available as open-source software under the lesser GNU public license (LGPL). Source code is available from the project web site at www.OpenMS.de},
     author = {O. Kohlbacher and K. Reinert and C. Gr{\"o}pl and E. Lange and N. Pfeiffer and O. Schulz-Trieglaff and M. Sturm},
     booktitle = {Proceedings of the 5th European Conference on Computational Biology (ECCB 2006)},
     title = {TOPP - The OpenMS Proteomics Pipeline},
     url = {http://publications.imp.fu-berlin.de/375/},
     year = {2006}
    }
  • E. Lange, C. Gröpl, K. Reinert, A. Hildebrandt, “High Accuracy Peak-Picking of Proteomics Data using Wavelet Techniques”, 2006.
    cite this publication
    @inproceedings{fu_mi_publications376,
     author = {E. Lange and C. Gr{\"o}pl and K. Reinert and A. Hildebrandt},
     booktitle = {Proceedings of the 11th Pacific Symposium on Biocomputing (PSB-06)},
     pages = {243--254},
     title = {High Accuracy Peak-Picking of Proteomics Data using Wavelet Techniques},
     url = {http://publications.imp.fu-berlin.de/376/},
     year = {2006}
    }
  • B. M. Mayr, O. Kohlbacher, K. Reinert, C. Gröpl, E. Lange, C. L. Klein, Ch. Huber, “Absolute Myoglobin Quantitation in Serum by Combining Two-Dimensional Liquid Chromatography-Electrospray Ionization Mass Spectrometry and Novel Data Analysis Algorithms”, vol. 5, 2006.
    cite this publication
    @article{fu_mi_publications385,
     author = {B. M. Mayr and O. Kohlbacher and K. Reinert and C. Gr{\"o}pl and E. Lange and C. L. Klein and Ch. Huber},
     journal = {Journal of Proteome Research},
     pages = {414--421},
     title = {Absolute Myoglobin Quantitation in Serum by Combining Two-Dimensional Liquid Chromatography-Electrospray Ionization Mass Spectrometry and Novel Data Analysis Algorithms},
     url = {http://publications.imp.fu-berlin.de/385/},
     volume = {5},
     year = {2006}
    }
  • T. Rausch, “Discovering causes of multifactorial diseases”, 2006-09.
    cite this publication
    @unpublished{fu_mi_publications389,
     author = {T. Rausch},
     month = {September},
     school = {Hasso-Plattner-Institut f{\"u}r Softwaresystemtechnik GmbH, Universit{\"a}t Potsdam},
     title = {Discovering causes of multifactorial diseases},
     url = {http://publications.imp.fu-berlin.de/389/},
     year = {2006}
    }
  • K. Reinert, O. Kohlbacher, C. Gröpl, E. Lange, O. Schulz-Trieglaff, M. Sturm, N. Pfeifer, “OpenMS - A Framework for Quantitative HPLC/MS-Based Proteomics”, iss. 05471, 2006.
    cite this publication
    @inproceedings{fu_mi_publications397,
     author = {K. Reinert and O. Kohlbacher and C. Gr{\"o}pl and E. Lange and O. Schulz-Trieglaff and M. Sturm and N. Pfeifer},
     booktitle = {Computational Proteomics},
     editor = {C. G. Huber and O. Kohlbacher and K. Reinert},
     note = {<http://drops.dagstuhl.de/opus/volltexte/2006/546> [date of citation: 2006-01-01]},
     number = {05471},
     publisher = {Internationales Begegnungs- und Forschungszentrum f{\"u}r Informatik (IBFI), Schloss Dagstuhl, Germany},
     series = {Dagstuhl Seminar Proceedings},
     title = {OpenMS - A Framework for Quantitative HPLC/MS-Based Proteomics},
     url = {http://publications.imp.fu-berlin.de/397/},
     year = {2006}
    }
  • W. E. Wolski, M. Farrow, A.-K. Emde, M. Lalowski, H. Lehrach, K. Reinert, “Analytical model of peptide mass cluster centres with applications”, vol. 4, iss. 18, 2006.
    cite this publication
    @article{fu_mi_publications415,
     author = {W. E. Wolski and M. Farrow and A.-K. Emde and M. Lalowski and H. Lehrach and K. Reinert},
     journal = {Proteome Science},
     number = {18},
     pages = {doi:10.1186/1477-5956},
     title = {Analytical model of peptide mass cluster centres with applications},
     url = {http://publications.imp.fu-berlin.de/415/},
     volume = {4},
     year = {2006}
    }
  • D. Weese, “Entwurf und Implementierung eines generischen Substring-Index”, p. 99, 2006-05-02.
    cite this publication
    @phdthesis{fu_mi_publications457,
     author = {D. Weese},
     month = {May},
     school = {Humboldt-University},
     title = {Entwurf und Implementierung eines generischen Substring-Index},
     url = {http://publications.imp.fu-berlin.de/457/},
     year = {2006}
    }
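
The TOPP abstract above is, at its core, about composing small single-purpose tools into analysis pipelines. A toy sketch of that pipeline idea in Python, with hypothetical step names and a made-up peak-list data model; this is not TOPP's actual interface or file handling:

    from functools import reduce

    def convert(raw):                    # stand-in for file format conversion
        return [float(x) for x in raw]

    def denoise(peaks, threshold=1.0):   # drop low-intensity signals
        return [p for p in peaks if p >= threshold]

    def pick_peaks(peaks):               # keep local maxima only
        return [p for i, p in enumerate(peaks)
                if (i == 0 or peaks[i-1] <= p)
                and (i == len(peaks)-1 or p >= peaks[i+1])]

    def pipeline(*steps):                # chain the steps left to right
        return lambda data: reduce(lambda d, f: f(d), steps, data)

    analyse = pipeline(convert, denoise, pick_peaks)
    print(analyse(["0.2", "3.1", "9.4", "2.2", "5.0"]))   # -> [9.4, 5.0]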

2005

  • V. Bafna, K. Reinert, “Mass Spectrometry and Computational Proteomics”, 2005.
    cite this publication
    @incollection{fu_mi_publications327,
     author = {V. Bafna and K. Reinert},
     booktitle = {Encyclopedia of Genetics, Genomics, Proteomics and Bioinformatics},
     publisher = {Wiley-Eastern},
     title = {Mass Spectrometry and Computational Proteomics},
     url = {http://publications.imp.fu-berlin.de/327/},
     year = {2005}
    }
  • M. Bauer, G. W. Klau, K. Reinert, “Multiple Structural RNA Alignment with Lagrangian Relaxation”, 2005.
    cite this publication
    @inproceedings{fu_mi_publications333,
     author = {M. Bauer and G. W. Klau and K. Reinert},
     booktitle = {Proceedings of the 5th Workshop on Algorithms in Bioinformatics (WABI-05)},
     pages = {303--314},
     title = {Multiple Structural RNA Alignment with Lagrangian Relaxation},
     url = {http://publications.imp.fu-berlin.de/333/},
     year = {2005}
    }
  • M. Bauer, G. W. Klau, K. Reinert, “Fast and Accurate Structural RNA Alignment by Progressive Lagrangian Relaxation”, 2005.
    cite this publication
    @inproceedings{fu_mi_publications334,
     author = {M. Bauer and G. W. Klau and K. Reinert},
     booktitle = {Proceedings of the 1st International Symposium on Computational Life Science (CompLife-05)},
     pages = {217--228},
     title = {Fast and Accurate Structural RNA Alignment by Progressive Lagrangian Relaxation},
     url = {http://publications.imp.fu-berlin.de/334/},
     year = {2005}
    }
  • M. Bodirsky, C. Gröpl, D. Johannsen, M. Kang, “A direct decomposition of 3-connected planar graphs”, 2005.
    cite this publication
    @inproceedings{fu_mi_publications337,
     address = {Taormina},
     author = {M. Bodirsky and C. Gr{\"o}pl and D. Johannsen and M. Kang},
     booktitle = {Proceedings of the 17th Annual International Conference on Formal Power Series and Algebraic Combinatorics (FPSAC05)},
     keywords = {Three-Connected Planar Graph, Cnet, Planar Graph, Enumeration, Random graphs, Random Generation, Dynamic Programming, Graph Theory},
     title = {A direct decomposition of 3-connected planar graphs},
     url = {http://publications.imp.fu-berlin.de/337/},
     year = {2005}
    }
  • M. Bodirsky, C. Gröpl, M. Kang, “Sampling Unlabeled Biconnected Planar Graphs”, 2005.
    cite this publication
    @inproceedings{fu_mi_publications340,
     author = {M. Bodirsky and C. Gr{\"o}pl and M. Kang},
     booktitle = {Proceedings of the 16th Annual International Symposium on Algorithms and Computation (ISAAC05)},
     keywords = {Planar Graph, Enumeration, Random graphs, Random Generation, Graph Theory},
     title = {Sampling Unlabeled Biconnected Planar Graphs},
     url = {http://publications.imp.fu-berlin.de/340/},
     year = {2005}
    }
  • C. Gröpl, “An Algorithm for Feature Finding in LC/MS Raw Data”, 2005.
    cite this publication
    @inproceedings{fu_mi_publications347,
     author = {C. Gr{\"o}pl},
     booktitle = {Computational Proteomics},
     note = {Extended abstract for talk given at Dagstuhl Seminar 05471 on Computational Proteomics, 20.-25. November 2005},
     publisher = {Dagstuhl Online Publication Server (DROPS)},
     title = {An Algorithm for Feature Finding in LC/MS Raw Data},
     url = {http://publications.imp.fu-berlin.de/347/},
     year = {2005}
    }
  • C. Gröpl, A. Hildebrandt, E. Lange, S. Lövenich, M. Sturm, “OpenMS - Software for Mass Spectrometry”, 2005.
    cite this publication
    @misc{fu_mi_publications351,
     author = {C. Gr{\"o}pl and A. Hildebrandt and E. Lange and S. L{\"o}venich and M. Sturm},
     keywords = {mass spectrometry, proteomics, open source software},
     note = {http://mbi.osu.edu/2004/workshops2004.html},
     title = {OpenMS - Software for Mass Spectrometry},
     url = {http://publications.imp.fu-berlin.de/351/},
     year = {2005}
    }
  • C. Gröpl, A. Hildebrandt, E. Lange, M. Sturm, “OpenMS - a generic open source framework for HPLC/MS-based proteomics”, 2005.
    cite this publication
    @misc{fu_mi_publications352,
     author = {C. Gr{\"o}pl and A. Hildebrandt and E. Lange and M. Sturm},
     keywords = {mass spectrometry, proteomics, open source software},
     note = {http://www.hupo2005.com/},
     title = {OpenMS - a generic open source framework for HPLC/MS-based proteomics},
     url = {http://publications.imp.fu-berlin.de/352/},
     year = {2005}
    }
  • C. Gröpl, E. Lange, K. Reinert, M. Sturm, Ch. Huber, B. M. Mayr, C. L. Klein, “Algorithms for the automated absolute quantification of diagnostic markers in complex proteomics samples”, 2005.
    cite this publication
    @inproceedings{fu_mi_publications358,
     abstract = {HPLC-ESI-MS is rapidly becoming an established standard method for shotgun proteomics. Currently, its major drawbacks are twofold: quantification is mostly limited to relative quantification and the large amount of data produced by every individual experiment can make manual analysis quite difficult. Here we present a new, combined experimental and algorithmic approach to absolutely quantify proteins from samples with unprecedented precision. We apply the method to the analysis of myoglobin in human blood serum, which is an important diagnostic marker for myocardial infarction. Our approach was able to determine the absolute amount of myoglobin in a serum sample through a series of standard addition experiments with a relative error of 2.5 percent. Compared to a manual analysis of the same dataset we could improve the precision and conduct it in a fraction of the time needed for the manual analysis. We anticipate that our automatic quantitation method will facilitate further absolute or relative quantitation of even more complex peptide samples. The algorithm was developed using our publicly available software framework OpenMS (www.openms.de).},
     author = {C. Gr{\"o}pl and E. Lange and K. Reinert and M. Sturm and Ch. Huber and B. M. Mayr and C. L. Klein},
     booktitle = {Proceedings of the 1st International Symposium on Computational Life Science (CompLife05)},
     pages = {151--163},
     title = {Algorithms for the automated absolute quantification of diagnostic markers in complex proteomics samples},
     url = {http://publications.imp.fu-berlin.de/358/},
     year = {2005}
    }
  • C. L. Klein, O. Kohlbacher, K. Reinert, “Reference methods and materials in standardisation and quality assurance (abstract)”, vol. 272, iss. Supplement 1, 2005.
    cite this publication
    @article{fu_mi_publications373,
     author = {C. L. Klein and O. Kohlbacher and K. Reinert},
     journal = {FEBS Journal},
     number = {Supplement 1},
     pages = {490--504},
     title = {Reference methods and materials in standardisation and quality assurance (abstract)},
     url = {http://publications.imp.fu-berlin.de/373/},
     volume = {272},
     year = {2005}
    }
  • E. Lange, C. Gröpl, K. Reinert, A. Hildebrandt, “High-accuracy peak picking of proteomics data”, 2005.
    cite this publication
    @inproceedings{fu_mi_publications377,
     author = {E. Lange and C. Gr{\"o}pl and K. Reinert and A. Hildebrandt},
     booktitle = {Computational Proteomics},
     note = {Extended abstract for talk given at Dagstuhl Seminar 05471 on Computational Proteomics, 20.-25. November 2005},
     publisher = {Dagstuhl Online Publication Server (DROPS)},
     title = {High-accuracy peak picking of proteomics data},
     url = {http://publications.imp.fu-berlin.de/377/},
     year = {2005}
    }
  • K. Reinert, O. Kohlbacher, C. Gröpl, E. Lange, O. Schulz-Trieglaff, M. Sturm, N. Pfeifer, “OpenMS - A Framework for Quantitative HPLC/MS-Based Proteomics”, 2005.
    cite this publication
    @inproceedings{fu_mi_publications398,
     author = {K. Reinert and O. Kohlbacher and C. Gr{\"o}pl and E. Lange and O. Schulz-Trieglaff and M. Sturm and N. Pfeifer},
     booktitle = {Computational Proteomics},
     note = {Extended abstract for talk given at Dagstuhl Seminar 05471 on Computational Proteomics, 20.-25. November 2005},
     publisher = {Dagstuhl Online Publication Server (DROPS)},
     title = {OpenMS - A Framework for Quantitative HPLC/MS-Based Proteomics},
     url = {http://publications.imp.fu-berlin.de/398/},
     year = {2005}
    }
  • O. Schulz-Trieglaff, “Modelling the Randomness in Biological Systems”, 2005.
    cite this publication
    @misc{fu_mi_publications402,
     author = {O. Schulz-Trieglaff},
     keywords = {petri nets, Gillespie algorithm},
     title = {Modelling the Randomness in Biological Systems},
     url = {http://publications.imp.fu-berlin.de/402/},
     year = {2005}
    }
  • O. Schulz-Trieglaff, “Software Platforms for Computational Proteomics”, 2005.
    cite this publication
    @inproceedings{fu_mi_publications403,
     author = {O. Schulz-Trieglaff},
     booktitle = {Computational Proteomics},
     note = {Extended abstract for talk given at Dagstuhl Seminar 05471 on Computational Proteomics, 20.-25. November 2005},
     publisher = {Dagstuhl Online Publication Server (DROPS)},
     title = {Software Platforms for Computational Proteomics},
     url = {http://publications.imp.fu-berlin.de/403/},
     year = {2005}
    }
  • W. E. Wolski, M. Lalowski, P. Jungblut, K. Reinert, “Calibration of mass spectrometric peptide mass fingerprint data without specific external or internal calibrants”, vol. 6, iss. 203, 2005.
    cite this publication
    @article{fu_mi_publications413,
     author = {W. E. Wolski and M. Lalowski and P. Jungblut and K. Reinert},
     journal = {BMC Bioinformatics},
     number = {203},
     pages = {http://www.biomedcentral.com/1471-2105/6/203},
     title = {Calibration of mass spectrometric peptide mass fingerprint data without specific external or internal calibrants},
     url = {http://publications.imp.fu-berlin.de/413/},
     volume = {6},
     year = {2005}
    }
  • E. Wolski, M. Lalowski, P. Martus, P. Giavalisco, J. Gobom, A. Sickmann, H. Lehrach, K. Reinert, “Transformation and other factors of the peptide mass spectrometry pairwise peaklist comparison process”, vol. 6, iss. 285, 2005.
    cite this publication
    @article{fu_mi_publications414,
     author = {E. Wolski and M. Lalowski and P. Martus and P. Giavalisco and J. Gobom and A. Sickmann and H. Lehrach and K. Reinert},
     journal = {BMC Bioinformatics},
     number = {285},
     pages = {http://www.biomedcentral.com/1471-2105/6/285},
     title = {Transformation and other factors of the peptide mass spectrometry pairwise peaklist comparison process},
     url = {http://publications.imp.fu-berlin.de/414/},
     volume = {6},
     year = {2005}
    }
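
The CompLife05 paper above quantifies myoglobin absolutely through standard addition experiments. A minimal worked example of the standard-addition calculation itself, with invented numbers: fit signal = a*added + b over the spiked samples, then read off the original concentration as the x-intercept magnitude b/a.

    import numpy as np

    # Invented standard-addition series: known analyte amounts spiked into
    # aliquots of the same sample, and the measured response of each.
    added  = np.array([0.0, 1.0, 2.0, 3.0])   # spiked concentration (arbitrary units)
    signal = np.array([2.1, 4.0, 6.2, 7.9])   # measured intensity
    a, b = np.polyfit(added, signal, 1)       # slope and intercept of the fit
    c0 = b / a                                # estimated original concentration
    print(f"slope={a:.2f}, intercept={b:.2f} -> sample contains ~{c0:.2f} units")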

2004

  • M. Bauer, G. W. Klau, “Structural Alignment of Two RNA Sequences with Lagrangian Relaxation”, 2004.
    cite this publication
    @inproceedings{fu_mi_publications330,
     author = {M. Bauer and G. W. Klau},
     booktitle = {Proceedings of the 15th International Symposium, ISAAC 2004, Hong Kong},
     pages = {113--125},
     publisher = {Springer Verlag},
     series = {LNCS 3341},
     title = {Structural Alignment of Two RNA Sequences with Lagrangian Relaxation},
     url = {http://publications.imp.fu-berlin.de/330/},
     year = {2004}
    }
  • E. Lange, “Peak picking in mass spectra”, 2004.
    cite this publication
    @misc{fu_mi_publications344,
     abstract = {High throughput analysis of proteins using mass spectrometry requires an efficient signal preprocessing which reduces the amount of data with minimal loss of information. Therefore the peaks that belong to a peptide have to be detected, and important features like their peak centroid position, height and area have to be determined. Here we present a peak picking algorithm which detects peaks in a noisy mass spectrum using its continuous wavelet transform. By adapting the wavelet to the theoretical peak shape, each peak's centroid position is precisely identified; thus, overlapping peaks (e.g. caused by electrospray ionization) are well separated. Representing each peak not only by its centroid position and area but also by the parameters of the best fitting asymmetric Lorentzian or hyperbolic secant squared functions yields valuable information about the original peak shape for further analysis. Given a mass spectrum with 100,000 raw data points representing 200 peaks, our algorithm stored 5 parameters per peak, and was able to reduce the memory requirement to 1 percent.},
     author = {E. Lange},
     title = {Peak picking in mass spectra},
     url = {http://publications.imp.fu-berlin.de/344/},
     year = {2004}
    }
  • C. Gröpl, H.-J. Prömel, A. Srivastav, “Ordered binary decision diagrams and the Shannon effect”, vol. 142, 2004.
    cite this publication
    @article{fu_mi_publications360,
     abstract = {We investigate the size and structure of ordered binary decision diagrams (OBDDs) for random Boolean functions. It was known that for most values of n, the expected OBDD size of a random Boolean function with n variables is equal to the worst-case size up to terms of lower order. Such a phenomenon is generally called the strong Shannon effect. Here we show that the strong Shannon effect is not valid for all n. Instead it undergoes a certain periodic `phase transition': If n lies within intervals of constant width around the values n = 2^h + h, then the strong Shannon effect does not hold, whereas it does hold outside these intervals. Our analysis provides doubly exponential probability bounds and generalises to ordered Kronecker functional decision diagrams (OKFDDs).},
     author = {C. Gr{\"o}pl and H.-J. Pr{\"o}mel and A. Srivastav},
     journal = {Discrete Applied Mathematics},
     keywords = {VLSI-Design and layout, Binary Decision Diagrams, Graph theory (other), Hardware verification, Probability theory, Random graphs, Randomized algorithms and probabilistic analysis, Theoretical computer science (other)},
     pages = {67--85},
     title = {Ordered binary decision diagrams and the Shannon effect},
     url = {http://publications.imp.fu-berlin.de/360/},
     volume = {142},
     year = {2004}
    }
  • S. Istrail, G. G. Sutton, L. Florea, C. M. Mobarry, R. Lippert, B. Walenz, H. Shatkay, Ian Dew, J. R. Miller, M. J. Flanigan, N. J. Edwards, R. Bolanos, D. Fasulo, B. V. Halldorsson, S. Hannenhalli, R. J. Turner, S. Yooseph, Fu Lu, D. R. Nusskern, B. C. Shue, X. H. Zheng, F. Zhong, A. L. Delcher, D. H. Huson, S. A. Kravitz, L. Mouchard, K. Reinert, K. A. Remington, A. G. Clark, M. S. Waterman, E. E. Eichler, M. D. Adams, M. W. Hunkapiller, E. W. Myers, J. C. Venter, “Whole-genome shotgun assembly and comparison of human genome assemblies”, vol. 101, iss. 7, 2004.
    cite this publication
    @article{fu_mi_publications369,
     author = {S. Istrail and G. G. Sutton and L. Florea and C. M. Mobarry and R. Lippert and B. Walenz and H. Shatkay and Ian Dew and J. R. Miller and M. J. Flanigan and N. J. Edwards and R. Bolanos and D. Fasulo and B. V. Halldorsson and S. Hannenhalli and R. J. Turner and S. Yooseph and Fu Lu and D. R. Nusskern and B. C. Shue and X. H. Zheng and F. Zhong and A. L. Delcher and D. H. Huson and S. A. Kravitz and L. Mouchard and K. Reinert and K. A. Remington and A. G. Clark and M. S. Waterman and E. E. Eichler and M. D. Adams and M. W. Hunkapiller and E. W. Myers and J. C. Venter},
     journal = {Proceedings of the National Academy of Sciences (PNAS)},
     number = {7},
     pages = {1916--1921},
     title = {Whole-genome shotgun assembly and comparison of human genome assemblies},
     url = {http://publications.imp.fu-berlin.de/369/},
     volume = {101},
     year = {2004}
    }
  • G. W. Klau, S. Rahmann, A. Schliep, K. Reinert, “Optimal Robust Non-Unique Probe Selection Using Integer Linear Programming”, 2004.
    cite this publication
    @inproceedings{fu_mi_publications372,
     author = {G. W. Klau and S. Rahmann and A. Schliep and K. Reinert},
     booktitle = {Proceedings of the Twelfth International Conference on Intelligent Systems for Molecular Biology (ISMB-04)},
     pages = {186--193},
     title = {Optimal Robust Non-Unique Probe Selection Using Integer Linear Programming},
     url = {http://publications.imp.fu-berlin.de/372/},
     year = {2004}
    }
  • O. Kohlbacher, K. Reinert, “Differenzielle Proteomanalyse -- Experimentelle Methoden, Algorithmische Herausforderungen”, vol. 46, 2004.
    cite this publication
    @article{fu_mi_publications374,
     author = {O. Kohlbacher and K. Reinert},
     journal = {it -- Information technology},
     pages = {31--38},
     title = {Differenzielle Proteomanalyse -- Experimentelle Methoden, Algorithmische Herausforderungen},
     url = {http://publications.imp.fu-berlin.de/374/},
     volume = {46},
     year = {2004}
    }
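
The Lange abstract above ("Peak picking in mass spectra") detects peptide peaks via a continuous wavelet transform. A rough sketch of the general technique, not the actual algorithm: correlate the spectrum with a Ricker wavelet, whose response is large wherever the signal locally matches a peak shape, and keep local maxima of the response above a threshold (the width and threshold values here are invented):

    import numpy as np

    def ricker(points, width):
        # discrete "Mexican hat" wavelet of the given width
        t = np.arange(points) - (points - 1) / 2.0
        a = 2.0 / (np.sqrt(3.0 * width) * np.pi ** 0.25)
        return a * (1 - (t / width) ** 2) * np.exp(-(t ** 2) / (2 * width ** 2))

    def cwt_peaks(signal, width=3.0, min_response=0.5):
        response = np.convolve(signal, ricker(int(10 * width), width), mode="same")
        return [i for i in range(1, len(signal) - 1)
                if response[i] > max(response[i-1], response[i+1])
                and response[i] >= min_response]

    # Synthetic spectrum: two Gaussian peaks plus noise.
    x = np.linspace(0, 10, 200)
    spectrum = np.exp(-((x - 3) ** 2) / 0.02) + 0.5 * np.exp(-((x - 7) ** 2) / 0.02)
    spectrum += 0.05 * np.random.default_rng(0).standard_normal(x.size)
    print("peak indices:", cwt_peaks(spectrum))   # expect indices near 60 and 139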

2003

  • M. Bodirsky, C. Gröpl, M. Kang, “Generating labeled planar graphs uniformly at random”, iss. 2719, 2003.
    cite this publication
    @inproceedings{fu_mi_publications341,
     abstract = {We present an expected polynomial time algorithm to generate a labeled planar graph uniformly at random. To generate the planar graphs, we derive recurrence formulas that count all such graphs with n vertices and m edges, based on a decomposition into 1-, 2-, and 3-connected components. For 3-connected graphs we apply a recent random generation algorithm by Schaeffer and a counting formula by Mullin and Schellenberg.},
     author = {M. Bodirsky and C. Gr{\"o}pl and M. Kang},
     booktitle = {Proceedings of ICALP 2003},
     keywords = {Planar Graph, Enumeration, Random graphs, Random Generation, Dynamic Programming, Graph Theory},
     note = {Appeared 2008 in Theoretical Computer Science. The download is the journal submission. From time to time we receive requests for source code, so here it is: see the files BGK03b.README and GBK03b.tar.bz2},
     number = {2719},
     pages = {1095--1107},
     publisher = {Springer Verlag},
     series = {Lecture Notes in Computer Science},
     title = {Generating labeled planar graphs uniformly at random},
     url = {http://publications.imp.fu-berlin.de/341/},
     year = {2003}
    }
  • M. Bodirsky, C. Gröpl, M. Kang, “Decomposing, Counting, and Generating Unlabeled Cubic Planar Graphs”, 2003.
    cite this publication
    @inproceedings{fu_mi_publications342,
     abstract = {We present an expected polynomial time algorithm to generate an unlabeled connected cubic planar graph uniformly at random. We first consider rooted cubic planar graphs, i.e., we count connected cubic planar graphs up to isomorphisms that fix a certain directed edge. Based on decompositions along the connectivity structure, we derive recurrence formulas for counting the exact number of rooted cubic planar graphs. This leads to 3-connected planar graphs, which have a unique embedding on the sphere; but special care has to be taken for rooted graphs that have a sense-reversing automorphism. Therefore we introduce the concept of colored networks, which stand in bijective correspondence to rooted graphs with given symmetries. Colored networks can again be decomposed along their connectivity structure. For rooted 3-connected cubic planar graphs embedded in the plane, we switch to the dual and count rooted triangulations. All these numbers can be evaluated in polynomial time by dynamic programming. We can use them to generate rooted cubic planar graphs uniformly at random. To generate connected cubic planar graphs without a root uniformly at random, we apply rejection sampling and obtain an expected polynomial time algorithm.},
     author = {M. Bodirsky and C. Gr{\"o}pl and M. Kang},
     booktitle = {European Conference on Combinatorics, Graph Theory, and Applications        EUROCOMB'03 Prague},
     keywords = {Cubic Planar Graph, Planar Graph, Cubic Graph, Enumeration, Random graphs, Random Generation, Dynamic Programming, Graph Theory},
     title = {Decomposing, Counting, and Generating Unlabeled Cubic Planar Graphs},
     url = {http://publications.imp.fu-berlin.de/342/},
     year = {2003}
    }
  • C. Frömmel, Ch. Gille, A. Goede, C. Gröpl, S. Hougardy, T. Nierhoff, R. Preissner, M. Thimm, “Accelerating screening of 3D protein data with a graph theoretical approach”, vol. 19, iss. 18, 2003-12.
    cite this publication
    @article{fu_mi_publications346,
     abstract = {Motivation: The Dictionary of Interfaces in Proteins (DIP) is a database collecting the 3D structure of interacting parts of proteins that are called patches. It serves as a repository, in which patches similar to given query patches can be found. The computation of the similarity of two patches is time consuming and traversing the entire DIP requires some hours. In this work we address the question of how the patches similar to a given query can be identified by scanning only a small part of DIP. The answer to this question requires the investigation of the distribution of the similarity of patches. Results: The score values describing the similarity of two patches can roughly be divided into three ranges that correspond to different levels of spatial similarity. Interestingly, the two iso-score lines separating the three classes can be determined by two different approaches. Applying a concept of the theory of random graphs reveals significant structural properties of the data in DIP. These can be used to accelerate scanning the DIP for patches similar to a given query. Searches for very similar patches could be accelerated by a factor of more than 25. Patches with a medium similarity could be found 10 times faster than by brute-force search. doi:10.1093/bioinformatics/btg343},
     author = {C. Fr{\"o}mmel and Ch. Gille and A. Goede and C. Gr{\"o}pl and S. Hougardy and T. Nierhoff and R. Preissner and M. Thimm},
     journal = {Bioinformatics},
     keywords = {algorithm, proteomics, statistics},
     month = {December},
     number = {18},
     pages = {2442--2447},
     title = {Accelerating screening of 3D protein data with a graph theoretical approach},
     url = {http://publications.imp.fu-berlin.de/346/},
     volume = {19},
     year = {2003}
    }
  • C. Gröpl, “Algorithmen in der Bioinformatik”, 2003.
    cite this publication
    @misc{fu_mi_publications348,
     author = {C. Gr{\"o}pl},
     note = {Lecture notes for my course in the winter semester 2002/03 at the Institut f{\"u}r Informatik, Humboldt-Universit{\"a}t zu Berlin},
     title = {Algorithmen in der Bioinformatik},
     url = {http://publications.imp.fu-berlin.de/348/},
     year = {2003}
    }
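
The ICALP 2003 entry above samples labeled planar graphs by first counting them exactly with recurrences over 1-, 2-, and 3-connected components. A minimal sketch of the same counting-recurrence paradigm on a far simpler class, connected labeled graphs: fixing vertex 1 and letting k be the size of its component gives 2^C(n,2) = sum over k of C(n-1,k-1) * c_k * 2^C(n-k,2), which can be solved for c_n. Exact counts like these are what drive recursive uniform samplers.

    from math import comb

    def connected_counts(n_max):
        c = [0, 1]                                  # c[1] = 1: the single vertex
        for n in range(2, n_max + 1):
            total = 2 ** comb(n, 2)                 # all labeled graphs on n vertices
            rest = sum(comb(n - 1, k - 1) * c[k] * 2 ** comb(n - k, 2)
                       for k in range(1, n))        # vertex 1 in a component of size k < n
            c.append(total - rest)
        return c

    print(connected_counts(5))   # [0, 1, 1, 4, 38, 728]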

2002

  • E. Althaus, A. Caprara, H.-P. Lenhof, K. Reinert, “Multiple Sequence alignment with arbitrary gap costs: Computing an optimal solution using polyhedral combinatorics”, 2002.
    cite this publication
    @inproceedings{fu_mi_publications326,
     author = {E. Althaus and A. Caprara and H.-P. Lenhof and K. Reinert},
     booktitle = {Proceedings of the 1st European Conference on Computational Biology (ECCB 2002)},
     pages = {4--16},
     title = {Multiple Sequence alignment with arbitrary gap costs: Computing an optimal solution using polyhedral combinatorics},
     url = {http://publications.imp.fu-berlin.de/326/},
     year = {2002}
    }
  • J. A. Bailey, Z. Gu, R. A. Clark, K. Reinert, R. V. Samonte, S. S. Schwartz, M. D. Adams, E. W. Myers, P. Li, E. E. Eichler, “Recent Segmental Duplications in the Human Genome”, vol. 297, 2002.
    cite this publication
    @article{fu_mi_publications328,
     author = {J. A. Bailey and Z. Gu and R. A. Clark and K. Reinert and R. V. Samonte and S. S. Schwartz and M. D. Adams and E. W. Myers and P. Li and E. E. Eichler},
     journal = {Science},
     pages = {1003--1007},
     title = {Recent Segmental Duplications in the Human Genome},
     url = {http://publications.imp.fu-berlin.de/328/},
     volume = {297},
     year = {2002}
    }
  • C. Gröpl, S. Hougardy, T. Nierhoff, H.-J. Prömel, “Steiner trees in uniformly quasi-bipartite graphs”, vol. 83, 2002.
    cite this publication
    @article{fu_mi_publications354,
     abstract = {The area of approximation algorithms for the Steiner tree problem in graphs has seen continuous progress over the last years. Currently the best approximation algorithm has a performance ratio of 1.550. This is still far away from 1.0074, the largest known lower bound on the achievable performance ratio. As all instances resulting from known lower bound reductions are uniformly quasi-bipartite, it is interesting whether this special case can be approximated better than the general case. We present an approximation algorithm with performance ratio 73/60 < 1.217 for the uniformly quasi-bipartite case. This improves on the previously known ratio of 1.279 of Robins and Zelikovsky. We use a new method of analysis that combines ideas from the greedy algorithm for set cover with a matroid-style exchange argument to model the connectivity constraint. As a consequence, we are able to provide a tight instance.},
     author = {C. Gr{\"o}pl and S. Hougardy and T. Nierhoff and H.-J. Pr{\"o}mel},
     journal = {Information Processing Letters},
     keywords = {Steiner trees, Graph algorithms, Approximation algorithms},
     pages = {195--200},
     title = {Steiner trees in uniformly quasi-bipartite graphs},
     url = {http://publications.imp.fu-berlin.de/354/},
     volume = {83},
     year = {2002}
    }
  • C. Gröpl, S. Hougardy, T. Nierhoff, H.-J. Prömel, M. Thimm, “Approximationsalgorithmen für das Steinerbaumproblem in Graphen”, 2002.
    cite this publication
    @misc{fu_mi_publications356,
     author = {C. Gr{\"o}pl and S. Hougardy and T. Nierhoff and H.-J. Pr{\"o}mel and M. Thimm},
     title = {Approximationsalgorithmen f{\"u}r das Steinerbaumproblem in Graphen},
     url = {http://publications.imp.fu-berlin.de/356/},
     year = {2002}
    }
  • A. L. Halpern, D. H. Huson, K. Reinert, “Segment Match refinement and applications”, 2002.
    cite this publication
    @inproceedings{fu_mi_publications364,
     author = {A. L. Halpern and D. H. Huson and K. Reinert},
     booktitle = {Proceedings of the 2nd Workshop on Algorithms in Bioinformatics (WABI-02)},
     pages = {126--139},
     title = {Segment Match refinement and applications},
     url = {http://publications.imp.fu-berlin.de/364/},
     year = {2002}
    }
  • D. H. Huson, K. Reinert, E. W. Myers, “The Greedy Path-Merging Algorithm for Sequence Assembly”, vol. 49, iss. 5, 2002.
    cite this publication
    @article{fu_mi_publications367,
     author = {D. H. Huson and K. Reinert and E. W. Myers},
     journal = {Journal of the ACM},
     number = {5},
     pages = {603--615},
     title = {The Greedy Path-Merging Algorithm for Sequence Assembly},
     url = {http://publications.imp.fu-berlin.de/367/},
     volume = {49},
     year = {2002}
    }
  • Richard Mural, M. D. Adams, K. Reinert, “A Comparison of Whole-Genome Shotgun-Derived Mouse Chromosome 16 and the Human Genome”, vol. 296, 2002.
    cite this publication
    @article{fu_mi_publications387,
     abstract = {The high degree of similarity between the mouse and human genomes is demonstrated through analysis of the sequence of mouse chromosome 16 (Mmu 16), which was obtained as part of a whole-genome shotgun assembly of the mouse genome. The mouse genome is about 10\% smaller than the human genome, owing to a lower repetitive DNA content. Comparison of the structure and protein-coding potential of Mmu 16 with that of the homologous segments of the human genome identifies regions of conserved synteny with human chromosomes (Hsa) 3, 8, 12, 16, 21, and 22. Gene content and order are highly conserved between Mmu 16 and the syntenic blocks of the human genome. Of the 731 predicted genes on Mmu 16, 509 align with orthologs on the corresponding portions of the human genome, 44 are likely paralogous to these genes, and 164 genes have homologs elsewhere in the human genome; there are 14 genes for which we could find no human counterpart.},
     author = {Richard Mural and M. D. Adams and K. Reinert},
     journal = {Science},
     pages = {1661--1671},
     title = {A Comparison of Whole-Genome Shotgun-Derived Mouse Chromosome 16 and the Human Genome},
     url = {http://publications.imp.fu-berlin.de/387/},
     volume = {296},
     year = {2002}
    }
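
Several 2002 entries concern Steiner tree approximation. For context, a sketch of the classical metric-closure 2-approximation, not the 73/60 algorithm of the quasi-bipartite paper above: connect the terminals by a minimum spanning tree of their shortest-path distance network and expand its edges back into graph paths (the graph is assumed connected; a full implementation would also prune redundant edges).

    import heapq

    def dijkstra(adj, src):
        # shortest-path distances and predecessor links from src
        dist, prev = {src: 0}, {}
        pq = [(0, src)]
        while pq:
            d, u = heapq.heappop(pq)
            if d > dist.get(u, float("inf")):
                continue
            for v, w in adj[u]:
                if d + w < dist.get(v, float("inf")):
                    dist[v], prev[v] = d + w, u
                    heapq.heappush(pq, (d + w, v))
        return dist, prev

    def steiner_2approx(adj, terminals):
        paths = {}
        for s in terminals:                        # metric closure between terminals
            dist, prev = dijkstra(adj, s)
            for t in terminals:
                if s < t:
                    paths[(s, t)] = (dist[t], prev)
        parent = {t: t for t in terminals}         # Kruskal over the distance network
        def find(x):
            while parent[x] != x:
                parent[x] = x = parent[parent[x]]
            return x
        tree = set()
        for (s, t), (d, prev) in sorted(paths.items(), key=lambda kv: kv[1][0]):
            if find(s) != find(t):
                parent[find(s)] = find(t)
                v = t                              # expand the metric edge into a path
                while v != s:
                    tree.add(tuple(sorted((v, prev[v]))))
                    v = prev[v]
        return tree

    graph = {"a": [("x", 1)], "b": [("x", 1)], "c": [("x", 1)],
             "x": [("a", 1), ("b", 1), ("c", 1)]}
    print(steiner_2approx(graph, ["a", "b", "c"]))   # the star through x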

2001

  • C. Gröpl, S. Hougardy, T. Nierhoff, H.-J. Prömel, “Approximation Algorithms for the Steiner Tree Problem in Graphs”, 2001.
    cite this publication
    @incollection{fu_mi_publications353,
     author = {C. Gr{\"o}pl and S. Hougardy and T. Nierhoff and H.-J. Pr{\"o}mel},
     booktitle = {Steiner Trees in Industry},
     editor = {X. Cheng and D. Z. Du},
     keywords = {Approximation algorithms, Combinatorial optimization, Graph algorithms,     Steiner trees},
     note = {Survey article with new proofs},
     pages = {235--279},
     publisher = {Kluwer Academic Publishers},
     title = {Approximation Algorithms for the Steiner Tree Problem in Graphs},
     url = {http://publications.imp.fu-berlin.de/353/},
     year = {2001}
    }
  • C. Gröpl, S. Hougardy, T. Nierhoff, H.-J. Prömel, “Lower bounds for approximation algorithms for the Steiner tree problem”, 2001.
    cite this publication
    @inproceedings{fu_mi_publications355,
     abstract = {The Steiner tree problem asks for a shortest subgraph connecting a given set of terminals in a graph. It is known to be APX-complete, which means that no polynomial time approximation scheme can exist for this problem, unless P=NP. Currently, the best approximation algorithm for the Steiner tree problem has a performance ratio of 1.55, whereas the corresponding lower bound is smaller than 1.01. In this paper, we provide for several Steiner tree approximation algorithms lower bounds on their performance ratio that are much larger. For two algorithms that solve the Steiner tree problem on quasi-bipartite instances, we even prove lower bounds that match the upper bounds. Quasi-bipartite instances are of special interest, as currently all known lower bound reductions for the Steiner tree problem in graphs produce such instances.},
     author = {C. Gr{\"o}pl and S. Hougardy and T. Nierhoff and H.-J. Pr{\"o}mel},
     booktitle = {Proceedings of the 27th International Workshop on Graph-Theoretic Concepts in Computer Science (2001)},
     keywords = {Steiner trees, Approximation algorithms, Combinatorial optimization, Graph algorithms, Hypergraphs, set systems, and designs},
     publisher = {Springer Verlag},
     series = {LNCS},
     title = {Lower bounds for approximation algorithms for the Steiner tree problem},
     url = {http://publications.imp.fu-berlin.de/355/},
     year = {2001}
    }
  • C. Gröpl, H.-J. Prömel, A. Srivastav, “On the Evolution of the Worst-Case OBDD Size”, vol. 77, 2001.
    cite this publication
    @article{fu_mi_publications361,
     abstract = {We prove matching lower and upper bounds on the worst-case OBDD size of a Boolean function, revealing an interesting oscillating behaviour.},
     author = {C. Gr{\"o}pl and H.-J. Pr{\"o}mel and A. Srivastav},
     journal = {Information Processing Letters},
     keywords = {Binary Decision Diagrams},
     pages = {1--7},
     title = {On the Evolution of the Worst-Case OBDD Size},
     url = {http://publications.imp.fu-berlin.de/361/},
     volume = {77},
     year = {2001}
    }
  • D. H. Huson, A. L. Halpern, Z. Lai, E. W. Myers, K. Reinert, G. G. Sutton, “Comparing Assemblies using Fragments and Mate-pairs”, 2001.
    cite this publication
    @inproceedings{fu_mi_publications365,
     author = {D. H. Huson and A. L. Halpern and Z. Lai and E. W. Myers and K. Reinert and G. G. Sutton},
     booktitle = {Proceedings of the 1st Workshop on Algorithms in Bioinformatics (WABI-01)},
     pages = {294--306},
     title = {Comparing Assemblies using Fragments and Mate-pairs},
     url = {http://publications.imp.fu-berlin.de/365/},
     year = {2001}
    }
  • D. H. Huson, K. Reinert, S. A. Kravitz, K. A. Remington, A. L. Delcher, I. M. Dew, M. J. Flanigan, A. L. Halpern, Z. Lai, C. M. Mobarry, G. G. Sutton, E. W. Myers, “Design of a compartmentalized Shotgun Assembler for the Human Genome”, 2001.
    cite this publication
    @inproceedings{fu_mi_publications366,
     author = {D. H. Huson and K. Reinert and S. A. Kravitz and K. A. Remington and A. L. Delcher and I. M. Dew and M. J. Flanigan and A. L. Halpern and Z. Lai and C. M. Mobarry and G. G. Sutton and E. W. Myers},
     booktitle = {Proceedings of the Ninth International Conference on Intelligent Systems for Molecular Biology (ISMB-01)},
     pages = {132--139},
     title = {Design of a compartmentalized Shotgun Assembler for the Human Genome},
     url = {http://publications.imp.fu-berlin.de/366/},
     year = {2001}
    }
  • D. H. Huson, K. Reinert, E. W. Myers, “The Greedy Path-Merging Algorithm for Sequence Assembly”, 2001.
    cite this publication
    @inproceedings{fu_mi_publications368,
     author = {D. H. Huson and K. Reinert and E. W. Myers},
     booktitle = {Proceedings of the Fifth Annual International Conference on Computational Molecular Biology (RECOMB-01)},
     pages = {157--163},
     title = {The Greedy Path-Merging Algorithm for Sequence Assembly},
     url = {http://publications.imp.fu-berlin.de/368/},
     year = {2001}
    }
  • R. J. Turner, K. Chaturvedi, N. J. Edwards, A. L. Halpern, D. H. Huson, O. Kohlbacher, J. R. Miller, K. Reinert, K. A. Remington, R. Schwartz, B. Walenz, S. Yooseph, S. Istrail, “Visualization Challenges for a New Cyberpharmaceutical Computing”, 2001.
    cite this publication
    @inproceedings{fu_mi_publications410,
     author = {R. J. Turner and K. Chaturvedi and N. J. Edwards and A. L. Halpern and D. H. Huson and O. Kohlbacher and J. R. Miller and K. Reinert and K. A. Remington and R. Schwartz and B. Walenz and S. Yooseph and S. Istrail},
     booktitle = {IEEE 2001 Symposium on Parallel and Large-Data Visualization and Graphics},
     note = {Keynote address},
     pages = {7--18},
     title = {Visualization Challenges for a New Cyberpharmaceutical Computing},
     url = {http://publications.imp.fu-berlin.de/410/},
     year = {2001}
    }
  • J. C. Venter, M. D. Adams, E. W. Myers, K. Reinert, et al., “The Sequence of the Human Genome”, vol. 291, iss. 5507, 2001.
    cite this publication
    @article{fu_mi_publications411,
     abstract = {A 2.91-billion base pair (bp) consensus sequence of the euchromatic portion of the human genome was generated by the whole-genome shotgun sequencing method. The 14.8-billion bp DNA sequence was generated over 9 months from 27,271,853 high-quality sequence reads (5.11-fold coverage of the genome) from both ends of plasmid clones made from the DNA of five individuals. Two assembly strategies{--}a whole-genome assembly and a regional chromosome assembly{--}were used, each combining sequence data from Celera and the publicly funded genome effort. The public data were shredded into 550-bp segments to create a 2.9-fold coverage of those genome regions that had been sequenced, without including biases inherent in the cloning and assembly procedure used by the publicly funded group. This brought the effective cov- erage in the assemblies to eightfold, reducing the number and size of gaps in the final assembly over what would be obtained with 5.11-fold coverage. The two assembly strategies yielded very similar results that largely agree with independent mapping data. The assemblies effectively cover the euchromatic regions of the human chromosomes. More than 90\% of the genome is in scaffold assemblies of 100,000 bp or more, and 25\% of the genome is in scaffolds of 10 million bp or larger. Analysis of the genome sequence revealed 26,588 protein-encoding transcripts for which there was strong corroborating evidence and an additional},
     author = {J. C. Venter and M. D. Adams and E. W. Myers and K. Reinert and others},
     journal = {Science},
     keywords = {ASSEMBLY},
     number = {5507},
     pages = {1304--1351},
     title = {The Sequence of the Human Genome},
     url = {http://publications.imp.fu-berlin.de/411/},
     volume = {291},
     year = {2001}
    }
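
The coverage figures in the Venter et al. abstract above follow from simple arithmetic: sequencing coverage is total sequenced bases divided by genome length. A worked check using only numbers quoted in the abstract:

    # Numbers taken from the abstract of "The Sequence of the Human Genome".
    reads = 27_271_853            # high-quality sequence reads
    genome_bp = 2.91e9            # euchromatic consensus length (bp)
    sequenced_bp = 14.8e9         # total raw sequence generated (bp)
    print(f"coverage: {sequenced_bp / genome_bp:.2f}x")        # ~5.1-fold
    print(f"mean read length: {sequenced_bp / reads:.0f} bp")  # ~543 bp
    # Shredding the publicly funded data (2.9x) and pooling it with these
    # ~5.1x of reads is what brings the effective coverage to roughly 8-fold.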

2000

  • “The Genome Sequence of Drosophila melanogaster”, vol. 287, iss. 5461, 2000.
    cite this publication
    @article{fu_mi_publications324,
     journal = {Science},
     keywords = {ASSEMBLY},
     number = {5461},
     pages = {2185--2195},
     title = {The Genome Sequence of Drosophila melanogaster},
     url = {http://publications.imp.fu-berlin.de/324/},
     volume = {287},
     year = {2000}
    }
  • G. Baudis, C. Gröpl, S. Hougardy, T. Nierhoff, H.-J. Prömel, “Approximating Minimum Spanning Sets in Hypergraphs and Polymatroids”, 2000.
    cite this publication
    @techreport{fu_mi_publications329,
     author = {G. Baudis and C. Gr{\"o}pl and S. Hougardy and T. Nierhoff and H.-J. Pr{\"o}mel},
     institution = {Humboldt-University Berlin},
     keywords = {Hypergraphs, set systems, and designs, Steiner trees, Approximation algorithms, Colouring, packing and covering, Combinatorial optimization, Graph algorithms},
     note = {This paper was already accepted for ICALP 2000 but we did not present it since later we were informed that the main result had already been proven in a different way.},
     title = {Approximating Minimum Spanning Sets in Hypergraphs and Polymatroids},
     type = {Technical Report},
     url = {http://publications.imp.fu-berlin.de/329/},
     year = {2000}
    }
  • J. D. Kececioglu, H.-P. Lenhof, K. Mehlhorn, K. Reinert, M. Vingron, “A Polyhedral Approach to Sequence Alignment Problems”, vol. 104, 2000.
    cite this publication
    @article{fu_mi_publications370,
     author = {J. D. Kececioglu and H.-P. Lenhof and K. Mehlhorn and K. Reinert and M. Vingron},
     journal = {Discrete Applied Mathematics},
     pages = {143--186},
     title = {A Polyhedral Approach to Sequence Alignment Problems},
     url = {http://publications.imp.fu-berlin.de/370/},
     volume = {104},
     year = {2000}
    }
  • M. Lermen, K. Reinert, “The Practical Use of the A* Algorithm for Exact Multiple Sequence Alignment”, 2000.
    cite this publication
    @article{fu_mi_publications383,
     author = {M. Lermen and K. Reinert},
     journal = {Journal of Computational Biology},
     pages = {655--671},
     title = {The Practical Use of the A* Algorithm for Exact Multiple Sequence Alignment},
     url = {http://publications.imp.fu-berlin.de/383/},
     year = {2000}
    }
  • E. W. Myers, G. G. Sutton, A. L. Delcher, D. P. Dew, M. J. Flanigan, S. A. Kravitz, C. M. Mobarry, K. Reinert, K. A. Remington, E. L. Anson, R. Bolanos, H.-H. Chou, C. M. Jordan, A. L. Halpern, S. Lonardi, E. M. Beasly, R. C. Brandon, L. Chen, P. J. Dunn, Z. Lai, Y. Liang, D. R. Nusskern, M. Zhan, Q. Zhang, X. H. Zheng, G. M. Rubin, M. D. Adams, J. C. Venter, “A Whole-Genome Assembly of Drosophila”, vol. 287, iss. 5461, 2000.
    cite this publication
    @article{fu_mi_publications388,
     author = {E. W. Myers and G. G. Sutton and A. L. Delcher and D. P. Dew and M. J. Flanigan and S. A. Kravitz and C. M. Mobarry and K. Reinert and K. A. Remington and E. L. Anson and R. Bolanos and H.-H. Chou and C. M. Jordan and A. L. Halpern and S. Lonardi and E. M. Beasly and R. C. Brandon and L. Chen and P. J. Dunn and Z. Lai and Y. Liang and D. R. Nusskern and M. Zhan and Q. Zhang and X. H. Zheng and G. M. Rubin and M. D. Adams and J. C. Venter},
     journal = {Science},
     keywords = {ASSEMBLY},
     number = {5461},
     pages = {2196--2203},
     title = {A Whole-Genome Assembly of Drosophila},
     url = {http://publications.imp.fu-berlin.de/388/},
     volume = {287},
     year = {2000}
    }
  • K. Reinert, J. Stoye, T. Will, “An Iterative Method for Faster Sum-of-Pairs Multiple Sequence Alignment”, vol. 16, iss. 9, 2000.
    cite this publication
    @article{fu_mi_publications400,
     author = {K. Reinert and J. Stoye and T. Will},
     journal = {Bioinformatics},
     number = {9},
     pages = {808--814},
     title = {An Iterative Method for Faster Sum-of-Pairs Multiple Sequence Alignment},
     url = {http://publications.imp.fu-berlin.de/400/},
     volume = {16},
     year = {2000}
    }

1999

  • C. Gröpl, “Binary Decision Diagrams for Random Boolean Functions”, 1999.
    cite this publication
    @unpublished{fu_mi_publications349,
     abstract = {Binary Decision Diagrams (BDDs) are a data structure for Boolean functions which are also known as branching programs. In ordered binary decision diagrams (OBDDs), the tests have to obey a fixed variable ordering. In free binary decision diagrams (FBDDs), each variable can be tested at most once. The efficiency of new variants of the BDD concept is usually demonstrated with spectacular (worst-case) examples. We pursue another approach and compare the representation sizes of almost all Boolean functions. Whereas I. Wegener proved that for `most' values of n the expected OBDD size of a random Boolean function of n variables is equal to the worst-case size up to terms of lower order, we show that this is not the case for n within intervals of constant length around the values n = 2^h + h. Furthermore, ranges of n exist for which minimal FBDDs are almost always at least a constant factor smaller than minimal OBDDs. Our main theorems have doubly exponentially small probability bounds (in n). We also investigate the evolution of random OBDDs and their worst-case size, revealing an oscillating behaviour that explains why certain results cannot be improved in general.},
     author = {C. Gr{\"o}pl},
 keywords = {Binary decision diagram, Boolean function, probabilistic analysis, Shannon effect},
     school = {Humboldt-Universit{\"a}t zu Berlin},
     title = {Binary Decision Diagrams for Random Boolean Functions},
     url = {http://publications.imp.fu-berlin.de/349/},
     year = {1999}
    }
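    The abstract above defines OBDDs by a single fixed variable ordering that
    every root-to-terminal path must respect. Purely as an illustrative
    sketch (the node layout and the example function are hypothetical, not
    taken from the publication), such a diagram can be stored as an array of
    nodes and evaluated by following one path from the root to a terminal:

        #include <cassert>
        #include <vector>

        // One OBDD node: an internal node tests a variable and branches;
        // a terminal (var == -1) carries the function value.
        struct ObddNode {
            int var;   // index of the tested variable; -1 marks a terminal
            int lo;    // successor node index when the variable is 0
            int hi;    // successor node index when the variable is 1
            bool val;  // terminal value, meaningful only when var == -1
        };

        // Evaluate the represented function on assignment x by walking
        // a single root-to-terminal path (one test per level, in order).
        bool evaluate(const std::vector<ObddNode>& nodes, int root,
                      const std::vector<bool>& x)
        {
            int cur = root;
            while (nodes[cur].var != -1)
                cur = x[nodes[cur].var] ? nodes[cur].hi : nodes[cur].lo;
            return nodes[cur].val;
        }

        int main()
        {
            // OBDD for f(x0, x1) = x0 AND x1 with variable order x0 < x1.
            std::vector<ObddNode> nodes = {
                {-1, 0, 0, false},  // node 0: terminal 0
                {-1, 0, 0, true},   // node 1: terminal 1
                { 1, 0, 1, false},  // node 2: tests x1
                { 0, 0, 2, false},  // node 3: tests x0 (root)
            };
            assert(evaluate(nodes, 3, {true, true}) == true);
            assert(evaluate(nodes, 3, {true, false}) == false);
            return 0;
        }

    An FBDD would relax the fixed order and only require that no variable
    repeats along a path, which is why minimal FBDDs can be smaller than
    minimal OBDDs, as the paper quantifies for random functions.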
  • H.-P. Lenhof, B. Morgenstern, K. Reinert, “An exact solution for the segment-to-segment multiple sequence alignment problem”, vol. 15, iss. 3, 1999.
    cite this publication
    @article{fu_mi_publications381,
     author = {H.-P. Lenhof and B. Morgenstern and K. Reinert},
     journal = {BIOINFORMATICS},
     number = {3},
     pages = {203--210},
 title = {An exact solution for the segment-to-segment multiple sequence alignment problem},
     url = {http://publications.imp.fu-berlin.de/381/},
     volume = {15},
     year = {1999}
    }

1998

  • C. Gröpl, H.-J. Prömel, A. Srivastav, “Size and Structure of Random Ordered Binary Decision Diagrams (Extended Abstract)”, iss. 1373, 1998.
    cite this publication
    @inproceedings{fu_mi_publications359,
     address = {Berlin, Heidelberg, New York},
     author = {C. Gr{\"o}pl and H.-J. Pr{\"o}mel and A. Srivastav},
     booktitle = {STACS 98},
 editor = {D. Krob and Ch. Meinel and M. Morvan},
     keywords = {VLSI-Design and layout, Hardware verification, Random graphs},
     number = {1373},
     pages = {238--248},
     publisher = {Springer Verlag},
     series = {Lecture Notes in Computer Science},
 title = {Size and Structure of Random Ordered Binary Decision Diagrams (Extended Abstract)},
     url = {http://publications.imp.fu-berlin.de/359/},
     year = {1998}
    }
  • C. Gröpl, M. Skutella, “Parallel Repetition of MIP(2,1) Systems”, vol. 1367, 1998.
    cite this publication
    @incollection{fu_mi_publications363,
     author = {C. Gr{\"o}pl and M. Skutella},
     booktitle = {Lectures on Proof Verification and Approximation Algorithms},
     editor = {E. W. Mayr and H.-J. Pr{\"o}mel and A. Steger},
     keywords = {Theoretical computer science (other), Approximation algorithms, PCP and non-approximability},
 note = {The book grew out of a Dagstuhl Seminar, April 21-25, 1997},
     pages = {161--177},
     publisher = {Springer},
     series = {Lecture Notes in Computer Science},
     title = {Parallel Repetition of MIP(2,1) Systems},
     url = {http://publications.imp.fu-berlin.de/363/},
     volume = {1367},
     year = {1998}
    }
  • H.-P. Lenhof, K. Reinert, M. Vingron, “A polyhedral approach to RNA sequence structure alignment”, 1998.
    cite this publication
    @inproceedings{fu_mi_publications380,
     author = {H.-P. Lenhof and K. Reinert and M. Vingron},
     booktitle = {Proceedings of the Second Annual International Conference on Computational  Molecular Biology (RECOMB-98)},
     pages = {153--162},
     title = {A polyhedral approach to RNA sequence structure alignment},
     url = {http://publications.imp.fu-berlin.de/380/},
     year = {1998}
    }
  • H.-P. Lenhof, K. Reinert, M. Vingron, “A Polyhedral Approach to RNA Sequence Structure Alignment”, vol. 5, iss. 3, 1998.
    cite this publication
    @article{fu_mi_publications382,
     author = {H.-P. Lenhof and K. Reinert and M. Vingron},
     journal = {Journal of Computational Biology},
     number = {3},
     pages = {517--530},
     title = {A Polyhedral Approach to RNA Sequence Structure Alignment},
     url = {http://publications.imp.fu-berlin.de/382/},
     volume = {5},
     year = {1998}
    }

1997

  • M. Block, C. Gröpl, H. Preuss, H. J. Prömel, A. Srivastav, “Efficient ordering of state variables and transition relation partitions in symbolic model checking”, 1997.
    cite this publication
    @techreport{fu_mi_publications335,
     author = {M. Block and C. Gr{\"o}pl and H. Preuss and H. J. Pr{\"o}mel and A. Srivastav},
     institution = {Humboldt-Universit{\"a}t zu Berlin},
 keywords = {Randomized algorithms and probabilistic analysis, VLSI-Design and layout, Binary Decision Diagrams, Hardware verification, Local search and metaheuristics},
 title = {Efficient ordering of state variables and transition relation partitions in symbolic model checking},
     type = {Technical Report},
     url = {http://publications.imp.fu-berlin.de/335/},
     year = {1997}
    }
  • K. Reinert, H.-P. Lenhof, P. Mutzel, J. D. Kececioglu, “A branch-and-cut algorithm for multiple sequence alignment”, 1997.
    cite this publication
    @inproceedings{fu_mi_publications399,
     author = {K. Reinert and H.-P. Lenhof and P. Mutzel and J. D. Kececioglu},
 booktitle = {Proceedings of the First Annual International Conference on Computational Molecular Biology (RECOMB-97)},
 pages = {241--249},
 title = {A branch-and-cut algorithm for multiple sequence alignment},
     url = {http://publications.imp.fu-berlin.de/399/},
     year = {1997}
    }

1996

  • P. G. Bradford, K. Reinert, “Lower Bounds for Row Minima Searching”, 1996.
    cite this publication
    @inproceedings{fu_mi_publications343,
     author = {P. G. Bradford and K. Reinert},
 booktitle = {Proceedings of the 23rd International Colloquium on Automata, Languages, and Programming 1996 (ICALP-96), LNCS 1099},
     pages = {454--465},
     title = {Lower Bounds for Row Minima Searching},
     url = {http://publications.imp.fu-berlin.de/343/},
     year = {1996}
    }
  • C. Gröpl, “Über Approximationsalgorithmen zur Färbung k-färbbarer Graphen, die vektorchromatische Zahl und andere Varianten der vartheta-Funktion” [On approximation algorithms for colouring k-colourable graphs, the vector chromatic number, and other variants of the vartheta function], 1996-01.
    cite this publication
    @unpublished{fu_mi_publications350,
     address = {Forschungsinstitut f{\"u}r Diskrete Mathematik},
     author = {C. Gr{\"o}pl},
     keywords = {Approximation algorithms, Colouring, packing and covering},
     month = {January},
     school = {Rheinische Friedrich-Wilhelms-Universit{\"a}t Bonn},
     title = {{\"U}ber Approximationsalgorithmen zur F{\"a}rbung k-f{\"a}rbbarer  Graphen, die vektorchromatische Zahl und andere Varianten der vartheta-Funktion},
     url = {http://publications.imp.fu-berlin.de/350/},
     year = {1996}
    }