{"id": "handle/11234/1-5946", "created": "2026-01-16T13:22:21.596419+00:00", "updated": "2026-01-16T15:29:21.033665+00:00", "links": {"self": "https://nma.eosc.cz/api/datasets/handle/11234/1-5946", "parent": "https://nma.eosc.cz/api/datasets/8fh0n-9ha97", "latest": "https://nma.eosc.cz/api/datasets/handle/11234/1-5946/versions/latest", "record": "https://nma.eosc.cz/api/datasets/handle/11234/1-5946", "versions": "https://nma.eosc.cz/api/datasets/handle/11234/1-5946/versions", "self_html": "https://nma.eosc.cz/datasets/records/handle/11234/1-5946", "latest_html": "https://nma.eosc.cz/datasets/records/handle/11234/1-5946/latest", "self_iiif_manifest": "https://nma.eosc.cz/api/iiif/record:handle/11234/1-5946/manifest", "self_iiif_sequence": "https://nma.eosc.cz/api/iiif/record:handle/11234/1-5946/sequence/default", "files": "https://nma.eosc.cz/api/datasets/handle/11234/1-5946/files", "media_files": "https://nma.eosc.cz/api/datasets/handle/11234/1-5946/media-files/files", "archive": "https://nma.eosc.cz/api/datasets/handle/11234/1-5946/files-archive", "archive_media": "https://nma.eosc.cz/api/datasets/handle/11234/1-5946/media-files/files-archive", "access_links": "https://nma.eosc.cz/api/records/handle/11234/1-5946/access/links", "access_grants": "https://nma.eosc.cz/api/records/handle/11234/1-5946/access/grants", "access_users": "https://nma.eosc.cz/api/records/handle/11234/1-5946/access/users", "access_groups": "https://nma.eosc.cz/api/records/handle/11234/1-5946/access/groups", "access_request": "https://nma.eosc.cz/api/records/handle/11234/1-5946/access/request", "access": "https://nma.eosc.cz/api/records/handle/11234/1-5946/access", "self_persistent_html": "https://nma.eosc.cz/s/handle/11234/1-5946"}, "revision_id": 5, "parent": {"id": "8fh0n-9ha97", "access": {"owned_by": {"user": "system"}, "settings": {"allow_user_requests": false, "allow_guest_requests": false, "accept_conditions_text": null, "secret_link_expiration": 0}}, "communities": {}, "pids": {}}, 
"versions": {"is_latest": true, "index": 1}, "is_published": true, "is_draft": false, "$schema": "local://datasets-v1.0.0.json", "metadata": {"publication_date": "2025-06-27", "persistent_url": "https://hdl.handle.net/11234/1-5946", "resource_type": {"id": "publication", "title": {"cs": "Publikace", "de": "Publikation", "en": "Publication", "es": "Publicaci\u00f3n", "sv": "Publikation"}}, "creators": [{"person_or_org": {"type": "personal", "name": "Stankov, Vladislav", "given_name": "Vladislav", "family_name": "Stankov"}}, {"person_or_org": {"type": "personal", "name": "Kopp, Maty\u00e1\u0161", "given_name": "Maty\u00e1\u0161", "family_name": "Kopp"}}, {"person_or_org": {"type": "personal", "name": "Bojar, Ond\u0159ej", "given_name": "Ond\u0159ej", "family_name": "Bojar"}}], "title": "ParCzech4Speech 1.0", "publisher": "Charles University, Faculty of Mathematics and Physics, Institute of Formal and Applied Linguistics (UFAL)", "subjects": [{"subject": "Speech corpus"}, {"subject": "Czech"}, {"subject": "Text-speech alignment"}], "languages": [{"id": "ces", "title": {"en": "Czech"}}], "rights": [{"title": {"en": "http://creativecommons.org/licenses/by/4.0/"}}], "description": "We introduce ParCzech4Speech 1.0, a processed version of the ParCzech 4.0 corpus, targeted at speech modeling tasks with the largest variant containing 2,695 hours of aligned speech from 587 speakers. We combined the sound recordings of the Czech parliamentary speeches with the official transcripts. The recordings were processed with WhisperX and Wav2Vec 2.0 to extract automated audio-text alignment. 
\n\nThe dataset is offered in three flexible variants: \n(1) sentence-segmented for automatic speech recognition and speech synthesis tasks with clean boundaries, \n(2) unsegmented preserving original utterance flow across sentences, and \n(3) a raw-alignment for further custom refinement for other possible tasks.\n\nNote: This release contains alignment data and text segments (official and recognized transcripts). The source audio must be obtained separately from the AudioPSP 24.01 corpus, using the 'filePath' column to locate the corresponding audio file and the 'start'/'end' timestamps to extract specific segments.\n\nThe official transcripts are available in the ParCzech 4.0 corpus (http://hdl.handle.net/11234/1-5360).\nThe original audio files are available in the AudioPSP 24.01 corpus (http://hdl.handle.net/11234/1-5404).\n\nNote: All three variants are provided in both .tsv (tab-separated values) and .parquet (columnar binary) formats. The data content is identical across formats.", "funding": [{"funder": {"name": "NPO (EC NextGenEU RRF)"}, "award": {"number": "MPO 60273/24/21300/21000", "title": {"en": "CEDMO 2.0 NPO"}}}, {"funder": {"name": "M\u0160MT OP JAK Mezisektorov\u00e1 spolupr\u00e1ce"}, "award": {"number": "CZ.02.01.01/00/23_020/0008518", "title": {"en": "Jazykov\u011bda, um\u011bl\u00e1 inteligence a jazykov\u00e9 a \u0159e\u010dov\u00e9 technologie: od v\u00fdzkumu k aplikac\u00edm"}}}]}, "files": {"enabled": false, "order": [], "count": 0, "total_bytes": 0, "entries": {}}, "pids": {"oai": {"identifier": "oai:https://nma.eosc.cz:handle/11234/1-5946", "provider": "oai"}}, "access": {"record": "public", "files": "public", "embargo": {"active": false, "reason": null}, "status": "metadata-only"}, "media_files": {"enabled": false, "order": [], "count": 0, "total_bytes": 0, "entries": {}}, "status": "published", "deletion_status": {"is_deleted": false, "status": "P"}, "stats": {"this_version": {"views": 0, "unique_views": 0, "downloads": 0, 
"unique_downloads": 0, "data_volume": 0.0}, "all_versions": {"views": 0, "unique_views": 0, "downloads": 0, "unique_downloads": 0, "data_volume": 0.0}}, "custom_fields": {}}