/** Upper bound, in UTF-16 code units, for the length of a single chunk. */
const MAX_CHUNK_SIZE = 1_000;

/**
 * Body of the regex character class listing every character that is allowed
 * to stay attached to a word: digits, word characters, common punctuation and
 * a wide range of letters carrying diacritics, so languages other than Danish
 * are supported as well. The ranges are based on the diacritics defined here:
 * https://github.com/motss/normalize-diacritics/blob/master/src/index.ts
 */
const WORD_CHARACTER_CLASS = String.raw`\d\w!?%&$\.:,;\-_\u0041-\u005A\u0061-\u007A\u00A0\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u0131\u0134-\u0137\u0139-\u0149\u014C-\u0183\u0186-\u018C\u018E\u0190-\u0193\u0195\u0197-\u019A\u019C-\u01A5\u01AC-\u01B0\u01B2-\u01B6\u01C4-\u01ED\u01F0-\u01F5\u01F8-\u021B\u021E-\u0220\u0222-\u0233\u023A-\u0240\u0243-\u0245\u0247-\u0250\u0253-\u0254\u0256-\u0257\u025B\u0260\u0265\u0268\u026B\u026F\u0271-\u0272\u0275\u027D\u0288-\u0289\u028B-\u028C\u07C0\u1D79\u1D7D\u1E00-\u1E9B\u1E9E\u1EA0-\u1EF9\u1EFE-\u1EFF\u2184\u24B6-\u24E9\u2C60-\u2C6C\u2C6E-\u2C6F\u2C72-\u2C73\u2C75-\u2C76\u2C7E-\u2C7F\uA728-\uA729\uA732-\uA75B\uA75E-\uA763\uA779-\uA787\uA78D\uA790-\uA791\uA7A0-\uA7A9\uFB00-\uFB04\uFF21-\uFF3A\uFF41-\uFF5A\xDF`;

/**
 * Matches a space, a run of special characters and the first character of the
 * word that follows (captured as $1).
 *
 * The `u` flag makes the pattern operate on code points instead of UTF-16
 * code units, so a character outside the BMP (an emoji, for instance) is never
 * torn into two lone surrogates.
 */
const LEADING_SPECIAL_CHARACTERS = new RegExp(
  ` [^${WORD_CHARACTER_CLASS} ]+([^ ])`,
  "gu"
);

/**
 * Matches the last character of a word (captured as $1), a run of special
 * characters and the space that follows. Uses the `u` flag for the same
 * reason as LEADING_SPECIAL_CHARACTERS.
 */
const TRAILING_SPECIAL_CHARACTERS = new RegExp(
  `([^ ])[^${WORD_CHARACTER_CLASS} ]+ `,
  "gu"
);

/**
 * Split an input text into multiple chunks, which are small enough to be
 * handled within a single request to AWS Polly.
 *
 * Every whitespace character is first replaced by a plain space. The text is
 * then cropped at the last sentence-like boundary (". ", "! ", "? " or ": ")
 * that starts within the first MAX_CHUNK_SIZE characters; if there is none, at
 * the last space; and if there is no space either, after MAX_CHUNK_SIZE
 * characters (never in the middle of a surrogate pair).
 *
 * Each chunk is trimmed and stripped of special characters that sit between a
 * space and a word. Because those patterns require an adjacent space, special
 * characters at the very start or end of a chunk are left untouched.
 *
 * @param text - the raw text to split
 * @returns the non-empty chunks in their original order, each at most
 *   MAX_CHUNK_SIZE UTF-16 code units long
 */
export function splitTextIntoPollyChunks(text: string): string[] {
  let remainingText = text.replace(/\s/g, " ");
  const result: string[] = [];

  while (remainingText.length > MAX_CHUNK_SIZE) {
    // find a natural point where we can crop the input text
    let cropAt = Math.max(
      remainingText.lastIndexOf(". ", MAX_CHUNK_SIZE - 1),
      remainingText.lastIndexOf("! ", MAX_CHUNK_SIZE - 1),
      remainingText.lastIndexOf("? ", MAX_CHUNK_SIZE - 1),
      remainingText.lastIndexOf(": ", MAX_CHUNK_SIZE - 1)
    );

    // if no natural point was found, then go for the last word break
    // within the allowed chunk size
    if (cropAt === -1) {
      cropAt = remainingText.lastIndexOf(" ", MAX_CHUNK_SIZE - 1);
    }

    // if we were unable to identify a crop-point, then simply extract the
    // allowed characters, regardless of how the text will be cropped... :(
    if (cropAt === -1) {
      cropAt = MAX_CHUNK_SIZE - 1;

      // ...but never cut a surrogate pair in half, as that would leave a lone
      // surrogate at the end of this chunk and at the start of the next one
      const lastUnit = remainingText.charCodeAt(cropAt);
      const nextUnit = remainingText.charCodeAt(cropAt + 1);
      const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
      const continuesWithLowSurrogate =
        nextUnit >= 0xdc00 && nextUnit <= 0xdfff;

      if (endsOnHighSurrogate && continuesWithLowSurrogate) {
        cropAt -= 1;
      }
    }

    // the character at cropAt (punctuation mark or space) stays in this chunk
    result.push(remainingText.substring(0, cropAt + 1));
    remainingText = remainingText.substring(cropAt + 1);
  }

  result.push(remainingText);

  // loop over all chunks, and trim the output so it's optimized for use with
  // the Polly service
  return result
    .map((chunk) =>
      chunk
        .trim()

        // this check has been disabled, as it will not always work as intended
        // (for instance in cases of "tropiske- og subtropiske plantebælter")
        //
        // while the above is grammatically incorrect, it could still be a
        // valid pattern in other sentences
        //
        // code left here for future reference, the main purpose was to join
        // words that were purposefully hyphenated for better rendering in the
        // browser, but this would also be better handled with hyphens: auto
        // today
        //
        // see ADO #42739
        // .replace(/([^ ])-\s/g, "")
        // .replace(/-([^ ])\s/g, "")

        // remove special characters at the beginning / end of words
        .replace(LEADING_SPECIAL_CHARACTERS, " $1")
        .replace(TRAILING_SPECIAL_CHARACTERS, "$1 ")
    )
    .filter((chunk) => chunk.length > 0);
}
