Language Studio uses a number of different approaches to capitalization, depending on the language pair and client requirements. Some of these approaches rely on statistics and can work very well, but there are often cases where a source word is carried over into the target language and you would like its capitalization to match the source text. This is particularly important for languages such as Chinese or Japanese, where English words can appear in the output (for example, brand names such as IBM or product names such as ProWave).
JS9 can be used to check each word in the output translation to see if it exists in the source and, if it is found there, to copy the capitalization from the source.
//Language Studio Post-Processing Script
//Copyright 2008-2016 Omniscien Technologies Pte Ltd. All Rights Reserved.
//Use of any part of this script within non-Language Studio applications is not permitted.
//Description:
//The script will match the capitalization of the source text when the same word is in the target.
//
//NOTE: The script deliberately keeps to var / function-declaration style because the
//test harness at the bottom runs under Windows Script Host (WScript + ActiveXObject).

var isDebug = 0; //SET TO 0 FOR PRODUCTION
var isExternalTest = 0; //SET TO 0 FOR PRODUCTION
var logText = "";
var iJSNo = 9; //The number of the JS Script - some logic can leverage the number
var sSourceLang = "en";
var sTargetLang = "ja";
var sLangPair = "";

//Punctuation character classes.
//The script originally relied on \p{P}, which standard ECMAScript only understands when the
//regex carries the "u" flag; without it the pattern matches the literal text "p{P}".
//The explicit ranges below are an approximation of Unicode punctuation (ASCII, Latin-1,
//General Punctuation, CJK and full-width forms) that works without the "u" flag.
var sUnicodePunct = "\\u00A1\\u00A7\\u00AB\\u00B6\\u00B7\\u00BB\\u00BF" +
    "\\u2010-\\u2027\\u2030-\\u205E" +
    "\\u3001-\\u3003\\u3008-\\u3011\\u3014-\\u301F\\u3030\\u303D\\u30A0\\u30FB" +
    "\\uFF01-\\uFF03\\uFF05-\\uFF0A\\uFF0C-\\uFF0F\\uFF1A\\uFF1B\\uFF1F\\uFF20" +
    "\\uFF3B-\\uFF3D\\uFF3F\\uFF5B\\uFF5D\\uFF5F-\\uFF65";
//ASCII punctuation WITHOUT . , _ - @ (those are kept attached to words when tokenizing)
var sAsciiPunctTokenize = "!\"#%&'()*/:;?\\[\\\\\\]{}";
//Character class used by basicTokenize to space-separate punctuation
var rePunctTokenize = new RegExp("([" + sAsciiPunctTokenize + sUnicodePunct + "])", "g");
//Character class (as a pattern fragment) accepted as a word boundary when searching the source
var sPunctBoundary = "[" + sAsciiPunctTokenize + "._,@\\-" + sUnicodePunct + "]";

//Detects if we are running in the Language Studio Enterprise runtime environment
var bIsDev = ((typeof AOSession) == "undefined");
//The file-based test harness needs Windows Script Host; on any other host it is skipped
//instead of crashing on the missing ActiveXObject.
var bHasWScript = (typeof WScript != "undefined") && (typeof ActiveXObject != "undefined");

var sTestInFilePathSource = "";
var sTestInFilePathTarget = "";
var sTestOutFilePath = "";

if (bIsDev && bHasWScript) {
    //Running outside Language Studio: enable debug output and the external test harness
    isDebug = 1;
    isExternalTest = 1;

    //Set the paths to load the input files from for testing in Microsoft Visual Studio debugger.
    sTestInFilePathSource = currentPath() + "\\In" + iJSNo + "." + sSourceLang + ".source.txt";
    sTestInFilePathTarget = currentPath() + "\\In" + iJSNo + "." + sTargetLang + ".target.txt";
    sTestOutFilePath = currentPath() + "\\Out" + iJSNo + "." + sTargetLang + ".txt";
}

//Add prototypes if needed
if (!String.prototype.trim) {
    String.prototype.trim = function () {
        return this.replace(/^\s+|\s+$/g, '');
    };
}

//Entry point.
//  sAllSourceSegments: all source segments, one per line, separated by \n
//  sAllTargetSegments: all translated segments, one per line, separated by \n
//Returns the target segments with capitalization copied from the source where words match.
function main(sAllSourceSegments, sAllTargetSegments) {
    //Initialize
    log("main (start)");
    initializeLang();

    //Process the all segments as 1 block of text
    // sAllTargetSegments = processAllSegments(sAllSourceSegments, sAllTargetSegments);

    //Process each segment line one at a time
    sAllTargetSegments = processEachSegment(sAllSourceSegments, sAllTargetSegments);

    //Cleanup
    sAllTargetSegments = processFinalCleanup(sAllTargetSegments);

    log("main (exit)");

    // Return output
    return sAllTargetSegments;
}

//Process all segments together as one string (extension hook - currently returns the target unchanged)
function processAllSegments(sAllSourceSegments, sAllTargetSegments) {
    log("processAllSegments (start)");

    //Your code here

    log("processAllSegments (exit)");
    return sAllTargetSegments;
}

//Escapes regular expression metacharacters so that sText can be embedded in a pattern as a literal.
function escapeRegExp(sText) {
    return sText.replace(/[\\^$.*+?()[\]{}|]/g, "\\$&");
}

//A crude form of tokenize that is only suitable for simple tasks of splitting up data for quick analysis.
//Returns sIn with punctuation (except . , _ - @) surrounded by spaces, commas between
//alphanumerics spaced out, runs of spaces collapsed and the result trimmed.
function basicTokenize(sIn) {
    var sOut = sIn;

    switch (sSourceLang.toLowerCase()) {
        case "zh":
        case "ja":
        case "ko":
            //Remove the Han text - to Tokenize these languages, need to word segment
            //NOTE: Not to be used if you need the Han characters
            sOut = sOut.replace(/([\u4e00-\u9fa5]{1,})/gm, " "); //Replaces all Han text with a space
            break;
    }

    sOut = sOut.replace(rePunctTokenize, " $1 "); //Any punctuation except for .,_-@

    //Handle commas
    sOut = sOut.replace(/([a-zA-Z][ ]{0,}),([ ]{0,}[a-zA-Z0-9])/g, "$1 , $2");
    sOut = sOut.replace(/([a-zA-Z0-9][ ]{0,}),([ ]{0,}[a-zA-Z])/g, "$1 , $2");

    sOut = sOut.replace(/([ ]{2,})/g, " "); //Remove multiple spaces
    sOut = sOut.trim();

    return sOut;
}

//Copies the capitalization of the source onto matching words of the target.
//The target is split on single spaces; every word that contains a Latin letter is searched for
//(case-insensitively, delimited by start/end, space or punctuation) in the tokenized source.
//When found with different casing, the source casing replaces the target word.
//Multiple spaces in the target are always collapsed and the segment is trimmed.
function copySourceCapitalization(sSourceSegment, sTargetSegment) {
    var bChange = false;

    var sTokenizedSource = basicTokenize(sSourceSegment);
    sTargetSegment = sTargetSegment.replace(/([ ]{2,})/g, " ");
    sTargetSegment = sTargetSegment.trim();

    //The same matching is applied to romanized and non-romanized target languages.
    //***Add your own language specific filters (keyed on sTargetLang) inside the loop if necessary.
    var aTarget = sTargetSegment.split(' ');
    for (var i = 0; i < aTarget.length; i++) {
        var sWord = aTarget[i];

        //Only process if there are letters (upper or lower case)
        if (!/[A-Za-z]/.test(sWord)) {
            continue;
        }

        //Look for target word in source text. The word is escaped so that characters such as
        //+ ( ) [ in the translation cannot break or alter the pattern.
        var pattern = new RegExp("(^|[ ]|" + sPunctBoundary + ")(" + escapeRegExp(sWord) + ")($|[ ]|" + sPunctBoundary + ")", "i");
        var match = pattern.exec(sTokenizedSource);

        //Compare capitalization
        if (match != null && match[2] != sWord) {
            aTarget[i] = match[2];
            bChange = true;
        }
    }

    if (bChange) {
        //There has been a change, so rebuild the target with the adjusted format
        sTargetSegment = aTarget.join(' ');
    }

    return sTargetSegment;
}

//Processes a single segment
function processSegment(sSourceSegment, sTargetSegment) {
    //Copy capitalization from the source text to the target when matching words are found in the target segment
    sTargetSegment = copySourceCapitalization(sSourceSegment, sTargetSegment);
    return sTargetSegment;
}

//Reports an error through logError when the host provides it, otherwise through log().
//(logError is not defined in this script, so calling it unconditionally could hide the real error.)
function reportError(e) {
    if (typeof logError == "function") {
        logError(e);
    } else {
        log("ERROR: " + ((e && e.message) ? e.message : e));
    }
}

//Final functions to make sure that all data is valid and not small issues inserted such as BOM and multiple spaces
function processFinalCleanup(sAllTargetSegments) {
    log("processFinalCleanup (start)");

    try {
        //Safety checks - unlikely, but good practice to be sure.
        sAllTargetSegments = removeBOM(sAllTargetSegments); //Remove BOM
        sAllTargetSegments = sAllTargetSegments.replace(/([ ]{2,})/gm, " "); //Multiple spaces
        sAllTargetSegments = sAllTargetSegments.replace(/([ ]{0,}\n[ ]{0,})/gm, "\n"); //Trim each line

        //Your Code Here
    } catch (e) {
        reportError(e);
    }

    log("processFinalCleanup (exit)");
    return sAllTargetSegments;
}

//-----------------------------------------------------------------
//Standard functions do not modify below this point
//-----------------------------------------------------------------

//Loop through each segment once at a time.
//Source and target are paired by line number. Only lines present on BOTH sides are processed;
//surplus target lines are passed through unchanged and surplus source lines are ignored.
function processEachSegment(sAllSourceSegments, sAllTargetSegments) {
    //Line break must be \n not \r\n
    var aSourceSegments = sAllSourceSegments.split("\n");
    var aTargetSegments = sAllTargetSegments.split("\n");

    var iCount = Math.min(aSourceSegments.length, aTargetSegments.length);
    for (var i = 0; i < iCount; i++) {
        //Process segment
        aTargetSegments[i] = processSegment(aSourceSegments[i], aTargetSegments[i]);
    }

    sAllTargetSegments = aTargetSegments.join("\n");
    return sAllTargetSegments;
}

//Removes a Byte Order Marker (BOM) from the start of data.
//Handles both the decoded form (U+FEFF) and the three UTF-8 BOM bytes read as single-byte characters.
function removeBOM(sIn) {
    var aBOMs = ["\uFEFF", "\xef\xbb\xbf"];

    for (var i = 0; i < aBOMs.length; i++) {
        //Check for the BOM at the start of the data
        if (sIn.indexOf(aBOMs[i]) == 0) {
            //Drop only the leading BOM; the rest of the data is kept in full
            return sIn.substring(aBOMs[i].length);
        }
    }

    return sIn;
}

//Counts how many time a string occurs in another string
function stringCount(sIn, sCount) {
    return sIn.split(sCount).length - 1;
}

//Sets up language variables
function initializeLang() {
    sSourceLang = sSourceLang.toLowerCase();
    sTargetLang = sTargetLang.toLowerCase();
    sLangPair = sSourceLang + '-' + sTargetLang;
}

//Writes a line to the log output (only when isDebug is 1)
function log(sText) {
    if (isDebug == 1) {
        var now = new Date();
        logText += "\r\n" + formatLogDate(now) + "\t" + sText;
    }
}

//Formats a date as yyyy-MM-dd HH:mm:ss.fff using local time
function formatLogDate(dDate) {
    //zero-pad a single zero if needed
    var zp = function (val) {
        return (val <= 9 ? '0' + val : '' + val);
    };

    //zero-pad up to two zeroes if needed
    var zp2 = function (val) {
        return val <= 99 ? (val <= 9 ? '00' + val : '0' + val) : ('' + val);
    };

    return '' + dDate.getFullYear() + '-' + zp(dDate.getMonth() + 1) + '-' + zp(dDate.getDate()) + ' ' +
        zp(dDate.getHours()) + ':' + zp(dDate.getMinutes()) + ':' + zp(dDate.getSeconds()) + '.' +
        zp2(dDate.getMilliseconds());
}

//Autodetects the script number: from the script file name under WScript,
//or from the AOSession "ScriptNo" value when iJSNo is 0.
//iJSNo is left untouched when no valid number can be found.
function setScriptNo() {
    var iParsed;

    if (bIsDev) {
        if (typeof WScript == "undefined") {
            return;
        }
        //First run of digits in the script name
        var aDigits = /\d+/.exec(WScript.ScriptName.toString());
        if (aDigits != null) {
            iParsed = parseInt(aDigits[0], 10);
            if (!isNaN(iParsed)) {
                iJSNo = iParsed;
            }
        }
    } else {
        if (iJSNo == 0) {
            iParsed = parseInt(AOSession.getValue("ScriptNo"), 10);
            if (!isNaN(iParsed)) {
                iJSNo = iParsed;
            }
        }
    }
}

//WINDOWS TEST FUNCTIONS - For use with WScript

//Reads the source/target test files, runs main() and writes the result to the output file.
function ExternalTest() {
    if (isDebug) WScript.Echo('--Start Test--');

    //Read input from a file
    var sAllSourceSegments = fileToString(sTestInFilePathSource);
    var sAllTargetSegments = fileToString(sTestInFilePathTarget);

    //Normalize CRLF to \n (both inputs, so that target lines do not keep a trailing \r)
    sAllSourceSegments = sAllSourceSegments.replace(/\r\n|\r/g, "\n");
    sAllTargetSegments = sAllTargetSegments.replace(/\r\n|\r/g, "\n");

    //Execute test as a normal pre-processing step would execute it.
    var sOut = main(sAllSourceSegments, sAllTargetSegments);

    //Write the output to a file
    stringToFile(sTestOutFilePath, sOut);

    if (isDebug) {
        if (logText.length > 0) {
            WScript.Echo(logText);
        }
        WScript.Echo('--End Test--');
    }
}

//Returns the current working directory of the WScript shell
function currentPath() {
    return new ActiveXObject("WScript.Shell").CurrentDirectory;
}

//Load the UTF-8 file into a string
function fileToString(sFilePath) {
    //Relies on Microsoft ActiveX Data Objects (ADO) which must be installed
    var oStream = new ActiveXObject("ADODB.Stream");
    oStream.CharSet = "utf-8";
    oStream.Open();
    try {
        oStream.LoadFromFile(sFilePath);
        return removeBOM(oStream.ReadText());
    } finally {
        //Always release the stream, even when loading fails
        oStream.Close();
    }
}

//Write a string out to a UTF-8 file (overwrites an existing file)
function stringToFile(sFilePath, sData) {
    //Relies on Microsoft ActiveX Data Objects (ADO) which must be installed
    var oStream = new ActiveXObject("ADODB.Stream");
    oStream.CharSet = "utf-8";
    oStream.Open();
    try {
        oStream.WriteText(sData);
        oStream.SaveToFile(sFilePath, 2); //2 = create or overwrite
        oStream.Flush();
    } finally {
        oStream.Close();
    }
}

//Run the file-based test only when enabled above (never in production).
//Kept at the end of the script so that every variable it depends on is initialized first.
if (isExternalTest) ExternalTest();