Speech Marks

The speech marks returned with every synthesis request are a mapping between time and text. It informs the client when each word is spoken in the audio. This can be used for text highlighting, seeking, tracking usage, etc.

/**
 * A single speech mark: maps a span of the source text (character offsets)
 * to the time range in the audio during which it is spoken.
 * NOTE(review): time values appear to be milliseconds based on the example
 * below, but units are not stated — confirm against the synthesis API.
 */
type Chunk = {
  // Time in the audio at which this text begins.
  start_time: number
  // Time in the audio at which this text ends.
  end_time: number
  // Character offset in the (SSML-escaped) source text where this chunk starts.
  start: number
  // Character offset in the (SSML-escaped) source text where this chunk ends.
  end: number
  // The text spoken over this time range; SSML escaping of &, < and > is preserved.
  value: string
}

/**
 * The top-level speech mark covering the whole utterance, plus one child
 * Chunk per word. The parent's start/end and start_time/end_time span the
 * full text and audio; children may leave gaps between one another.
 */
type NestedChunk = Chunk & {
  // Per-word speech marks, in order of appearance in the text.
  chunks: Chunk[]
}

Typical Gotchas

  • The values are returned based on the SSML, so any escaping of &, < and > will be reflected in the value, start and end fields. You may consider using a string-tracking library to assist in the mapping.
  • The start and end values of each word may have gaps between them. If you're looking for the word at an index, look for the first chunk whose start is >= your index, rather than checking whether the index falls within the bounds of both start and end.
  • The start_time and end_time values of each word may also have gaps between them; follow the same advice as above.
  • The start_time of the first word is not necessarily 0, even though the NestedChunk's start_time is. There can be silence at the beginning of the sentence, which causes the first word to start part way through the audio.
  • The end_time of the last word does not necessarily correspond with the end_time of the NestedChunk. There can be silence at the end of the audio, which makes the NestedChunk longer.

Example output

// Example speech-mark payload. It demonstrates two of the gotchas above:
// - leading silence: the NestedChunk's start_time is 0 but the first word
//   ("This") does not begin until 125;
// - offset gaps: "This" ends at 4 and "is" starts at 5 — index 4 (the space)
//   falls inside no chunk, so bounds-checking both start and end would miss it.
const chunk: NestedChunk = {
  start: 0,
  end: 79,
  start_time: 0,
  end_time: 4292.58,
  value: 'This is a sentence used for testing with some text on the end to make it longer',
  chunks: [
    {
      start: 0,
      end: 4,
      start_time: 125,
      end_time: 250,
      value: 'This',
    },
    {
      start: 5,
      end: 7,
      start_time: 250,
      end_time: 375,
      value: 'is',
    },
    {
      start: 8,
      end: 9,
      start_time: 375,
      end_time: 500,
      value: 'a',
    },
    {
      start: 10,
      end: 18,
      start_time: 500,
      end_time: 937,
      value: 'sentence',
    },
    {
      start: 19,
      end: 23,
      start_time: 937,
      end_time: 1200,
      value: 'used',
    },
    {
      start: 24,
      end: 27,
      start_time: 1200,
      end_time: 1375,
      value: 'for',
    },
    {
      start: 28,
      end: 35,
      start_time: 1375,
      end_time: 1775,
      value: 'testing',
    },
    {
      start: 36,
      end: 40,
      start_time: 1775,
      end_time: 1937,
      value: 'with',
    },
    {
      start: 41,
      end: 45,
      start_time: 1937,
      end_time: 2125,
      value: 'some',
    },
    {
      start: 46,
      end: 50,
      start_time: 2125,
      end_time: 2500,
      value: 'text',
    },
    {
      start: 51,
      end: 53,
      start_time: 2500,
      end_time: 2625,
      value: 'on',
    },
    {
      start: 54,
      end: 57,
      start_time: 2625,
      end_time: 2850,
      value: 'the',
    },
    {
      start: 58,
      end: 61,
      start_time: 2850,
      end_time: 3000,
      value: 'end',
    },
    {
      start: 62,
      end: 64,
      start_time: 3000,
      end_time: 3125,
      value: 'to',
    },
    {
      start: 65,
      end: 69,
      start_time: 3125,
      end_time: 3312,
      value: 'make',
    },
    {
      start: 70,
      end: 72,
      start_time: 3312,
      end_time: 3437,
      value: 'it',
    },
    {
      start: 73,
      end: 79,
      start_time: 3437,
      end_time: 4292.58,
      value: 'longer',
    },
  ],
}