Matching Source Capitalization :

Language Studio uses a number of different approaches to capitalization, depending on the language pair and client requirements. Some of these approaches are statistical and can work very well, but there are often cases where a source word is carried over into the target output and you would like its capitalization to match the source text. This is often important for languages such as Chinese or Japanese, where English words can appear in the output (for example, brand names such as IBM or product names such as ProWave).
JS9 can be used to check each word in the output translation to see whether it exists in the source and, if it is found, to copy the capitalization from the source.
//Language Studio Post-Processing Script
//Copyright 2008-2016 Omniscien Technologies Pte Ltd. All Rights Reserved.
//Use of any part of this script within non-Language Studio applications is not permitted.

//Description:
//The script will match the capitalization of the source text when the same word is in the target.

//Script-wide settings and state shared by the functions below.
var isDebug = 0; //SET TO 0 FOR PRODUCTION - when 1, log() collects messages and ExternalTest() echoes them
var isExternalTest = 0; //SET TO 0 FOR PRODUCTION - when non-zero, ExternalTest() is run as the script loads
var logText = ""; //Accumulated log output; appended to by log()
var iJSNo = 9; //The number of the JS Script - some logic can leverage the number
var sSourceLang = "en"; //Source language code; lower-cased by initializeLang()
var sTargetLang = "ja"; //Target language code; lower-cased by initializeLang()
var sLangPair = ""; //Filled in by initializeLang() as "<source>-<target>"

//Detects if we are running in the Language Studio Enterprise runtime environment
//(the AOSession object is only defined there; without it we assume a developer machine)
var bIsDev = ((typeof AOSession) == "undefined");
if (bIsDev) {
    //We are not running inside Language Studio, so force debug logging and the external file-based test on
    isDebug = 1;
    isExternalTest = 1;
    //Set the paths to load the input files from for testing in Microsoft Visual Studio debugger.
    //The file names are built from the script number and the language codes, relative to the current directory.
    var sTestInFilePathSource = currentPath() + "\\In" + iJSNo + "." + sSourceLang + ".source.txt";
    var sTestInFilePathTarget = currentPath() + "\\In" + iJSNo + "." + sTargetLang + ".target.txt";
    var sTestOutFilePath = currentPath() + "\\Out" + iJSNo + "." + sTargetLang + ".txt";
}

//Add prototypes if needed
if (!String.prototype.trim) {
    //Fallback for script engines that have no native String.trim:
    //strip the leading whitespace first, then the trailing whitespace.
    String.prototype.trim = function () {
        var sNoLeading = this.replace(/^\s+/, '');
        return sNoLeading.replace(/\s+$/, '');
    };
}

//Entry point of the post-processing script.
//  sAllSourceSegments - all source segments, one per line
//  sAllTargetSegments - all translated segments, one per line
//  Returns the processed target segments as a single string.
function main(sAllSourceSegments, sAllTargetSegments) {
    log("main (start)");

    //Prepare the language settings before any processing happens
    initializeLang();

    //Alternative (disabled by default): call processAllSegments() here to treat
    //all of the segments as one block of text instead of line by line.

    //Handle the segments one line at a time
    var sResult = processEachSegment(sAllSourceSegments, sAllTargetSegments);

    //Tidy up the combined output
    sResult = processFinalCleanup(sResult);

    log("main (exit)");

    return sResult;
}

//Process all segments together as one string
//Template hook: receives every source and target segment at once and returns the
//(possibly modified) target text. As shipped it returns the target text unchanged.
function processAllSegments(sAllSourceSegments, sAllTargetSegments) {
    var sResult = sAllTargetSegments;

    log("processAllSegments (start)");

    //Your code here - apply any whole-text changes to sResult

    log("processAllSegments (exit)");

    return sResult;
}

//A crude form of tokenize that is only suitable for simple tasks of splitting up data for quick analysis. 
//  sIn - the text to tokenize (the language is taken from the global sSourceLang)
//  Returns the text with punctuation separated by single spaces and trimmed.
//NOTE: "\p{P}" is not usable here. Without the "u" flag (which older script engines do not have)
//it is not a punctuation class at all - it matches the literal text "p{P}". The punctuation is
//therefore listed explicitly: ASCII plus the commonly used Latin-1, General Punctuation,
//CJK and full-width punctuation characters.
function basicTokenize(sIn) {
    var sOut = sIn;

    //Punctuation that gets split off as its own token (everything except ASCII . _ , - @)
    var sPunctClass = "[" +
        "!-#%-*\\/:;?\\[-\\]{}" + //ASCII
        "\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF" + //Latin-1
        "\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E" + //General Punctuation
        "\u3001-\u3003\u3008-\u3011\u3014-\u301F\u30FB" + //CJK punctuation
        "\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20" + //Full-width forms
        "\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65" +
        "]";

    switch (sSourceLang.toLowerCase()) {
        case "zh":
        case "ja":
        case "ko":
            //Remove the Han text - to tokenize these languages properly they need word segmentation
            sOut = sOut.replace(/[\u4e00-\u9fa5]+/g, " "); //Replaces each run of Han text with a space
            //NOTE: Not to be used if you need the Han characters
            break;
    }

    //Put a space on both sides of every punctuation character in the class above
    sOut = sOut.replace(new RegExp("(" + sPunctClass + ")", "g"), " $1 ");

    //Handle commas: split a comma that sits between a letter and a letter/digit (either order).
    //Commas between two digits (for example 1,000) are left alone.
    sOut = sOut.replace(/([a-zA-Z][ ]*),([ ]*[a-zA-Z0-9])/g, "$1 , $2");
    sOut = sOut.replace(/([a-zA-Z0-9][ ]*),([ ]*[a-zA-Z])/g, "$1 , $2");

    sOut = sOut.replace(/[ ]{2,}/g, " "); //Remove multiple spaces

    return sOut.trim();
}

//Copies the capitalization of words in the source segment onto the matching words of the target segment.
//  sSourceSegment - one line of source text (tokenized internally with basicTokenize)
//  sTargetSegment - the translated line; it is split on single spaces into words
//  Returns the target segment with runs of spaces collapsed, trimmed, and with every word that also
//  occurs in the source (compared case-insensitively) replaced by the spelling used in the source.
//  A segment that is not a string is returned untouched.
function copySourceCapitalization(sSourceSegment, sTargetSegment) {
    //Nothing to compare when either side is missing (for example uneven line counts)
    if (typeof sSourceSegment != "string" || typeof sTargetSegment != "string") {
        return sTargetSegment;
    }

    //Characters accepted as a word boundary in the source in addition to a space.
    //"\p{P}" cannot be used: without the "u" flag it matches the literal text "p{P}",
    //so the common punctuation characters are listed explicitly.
    var sPunctClass = "[" +
        "!-#%-*,-\\/:;?@\\[-\\]_{}" + //ASCII
        "\u00A1\u00A7\u00AB\u00B6\u00B7\u00BB\u00BF" + //Latin-1
        "\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E" + //General Punctuation
        "\u3001-\u3003\u3008-\u3011\u3014-\u301F\u30FB" + //CJK punctuation
        "\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20" + //Full-width forms
        "\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65" +
        "]";

    var bChange = false;

    //Normalize both sides so that words are separated by exactly one space
    sSourceSegment = basicTokenize(sSourceSegment);
    sTargetSegment = sTargetSegment.replace(/[ ]{2,}/g, " ").trim();

    //The same matching is used for every target language. Words without Latin letters
    //(for example Chinese, Japanese or Thai text) are skipped by the letter check below.
    var aTarget = sTargetSegment.split(' ');
    for (var i = 0; i < aTarget.length; i++) {
        var sTargetWord = aTarget[i];

        //***Add your own language specific filters here if necessary (sTargetLang holds the target language).

        //Only process if there are letters
        if (!/[A-Za-z]/.test(sTargetWord)) {
            continue;
        }

        //Escape regular expression metacharacters so that words such as "C++" are matched literally
        var sEscapedWord = sTargetWord.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

        //Look for the target word as a whole word in the source text, ignoring case
        var pattern = new RegExp("(^|[ ]|" + sPunctClass + ")(" + sEscapedWord + ")($|[ ]|" + sPunctClass + ")", "i");
        var match = pattern.exec(sSourceSegment);

        //Take over the source spelling when only the capitalization differs
        if (match != null && match[2] != sTargetWord) {
            aTarget[i] = match[2];
            bChange = true;
        }
    }

    if (bChange) {
        //There has been a change, so rebuild the target with the adjusted words
        sTargetSegment = aTarget.join(' ');
    }

    return sTargetSegment;
}

//Processes a single segment
//  sSourceSegment - the source line
//  sTargetSegment - the translated line
//  Returns the processed translated line.
function processSegment(sSourceSegment, sTargetSegment) {
    //The only step for this script: take over the source capitalization for words found in both segments
    return copySourceCapitalization(sSourceSegment, sTargetSegment);
}

//Final functions to make sure that all data is valid and not small issues inserted such as BOM and multiple spaces
//  sAllTargetSegments - the complete target text, one segment per line
//  Returns the cleaned text. If a cleanup step fails the error is logged and the text is
//  returned in whatever state it had reached (best effort).
function processFinalCleanup(sAllTargetSegments) {
    log("processFinalCleanup (start)");

    try {
        //Safety checks - unlikely, but good practice to be sure.
        sAllTargetSegments = removeBOM(sAllTargetSegments); //Remove BOM
        sAllTargetSegments = sAllTargetSegments.replace(/[ ]{2,}/g, " "); //Multiple spaces
        sAllTargetSegments = sAllTargetSegments.replace(/[ ]*\n[ ]*/g, "\n"); //Trim around each line break
        sAllTargetSegments = sAllTargetSegments.replace(/^[ ]+|[ ]+$/g, ""); //Trim the start of the first and the end of the last line

        //Your Code Here

    }
    catch (e) {
        //logError is not defined in this script; use it only when the host provides it,
        //otherwise fall back to log() so that the error handler cannot fail itself.
        if (typeof logError == "function") {
            logError(e);
        }
        else {
            log("processFinalCleanup (error): " + (e && e.message ? e.message : e));
        }
    }

    log("processFinalCleanup (exit)");

    return sAllTargetSegments;
}

//-----------------------------------------------------------------
//Standard functions do not modify below this point
//-----------------------------------------------------------------

//Loop through each segment one at a time
//  sAllSourceSegments - all source segments separated by \n
//  sAllTargetSegments - all target segments separated by \n
//  Returns the processed target segments joined with \n.
//  When the two texts have a different number of lines only the lines that exist on both
//  sides are processed; extra target lines are kept as they are.
function processEachSegment(sAllSourceSegments, sAllTargetSegments) {
    //Line break must be \n not \r\n
    var aSourceSegments = sAllSourceSegments.split("\n");
    var aTargetSegments = sAllTargetSegments.split("\n");

    if (aSourceSegments.length != aTargetSegments.length) {
        log("processEachSegment: source has " + aSourceSegments.length + " line(s) but target has " + aTargetSegments.length);
    }

    //Index loop (not for...in) so that only real array elements are visited
    var iPairs = Math.min(aSourceSegments.length, aTargetSegments.length);
    for (var i = 0; i < iPairs; i++) {
        //Process segment
        aTargetSegments[i] = processSegment(aSourceSegments[i], aTargetSegments[i]);
    }

    return aTargetSegments.join("\n");
}

//Removes a Byte Order Marker (BOM) from the start of data
//Two forms are recognised: the decoded BOM character (U+FEFF) and the three UTF-8 BOM bytes
//read as single-byte characters ("\xef\xbb\xbf").
//  sIn - the text to check
//  Returns the text without a leading BOM; everything after the BOM is kept in full.
function removeBOM(sIn) {
    var aBOMs = ["\uFEFF", "\xef\xbb\xbf"];

    //Empty or missing input has nothing to strip
    if (!sIn) {
        return sIn;
    }

    //Check for a BOM at the start of the data
    for (var i = 0; i < aBOMs.length; i++) {
        if (sIn.indexOf(aBOMs[i]) == 0) {
            //substring(start) keeps the rest of the text up to its end
            return sIn.substring(aBOMs[i].length);
        }
    }
    return sIn;
}

//Counts how many time a string occurs in another string
//  sIn    - the text to search in
//  sCount - the separator to count
//  Returns the number of pieces produced by splitting sIn on sCount, minus one.
function stringCount(sIn, sCount) {
    var aPieces = sIn.split(sCount);
    return aPieces.length - 1;
}

//Sets up language variables
//Lower-cases the global sSourceLang and sTargetLang and stores "<source>-<target>" in sLangPair.
function initializeLang() {
    //Normalize the two language codes first
    sSourceLang = sSourceLang.toLowerCase();
    sTargetLang = sTargetLang.toLowerCase();

    //Then derive the language pair from the normalized codes
    sLangPair = [sSourceLang, sTargetLang].join('-');
}

//Writes a line to the log output
//Does nothing unless isDebug is 1. Each entry is appended to the global logText as
//a line break, the current timestamp, a tab and the message.
function log(sText) {
    if (isDebug != 1) {
        return;
    }

    var sStamp = formatLogDate(new Date());
    logText += "\r\n" + sStamp + "\t" + sText;
}

//Formats a date as "YYYY-MM-DD hh:mm:ss.mmm" using the local time of dDate.
//  dDate - the Date to format
//  Returns the formatted timestamp string.
function formatLogDate(dDate) {
    //Left-pads a value with zeroes until it is at least iWidth characters long
    var pad = function (vValue, iWidth) {
        var sValue = '' + vValue;
        while (sValue.length < iWidth) {
            sValue = '0' + sValue;
        }
        return sValue;
    };

    var sDatePart = [
        dDate.getFullYear(),
        pad(dDate.getMonth() + 1, 2), //getMonth() is zero-based
        pad(dDate.getDate(), 2)
    ].join('-');

    var sTimePart = [
        pad(dDate.getHours(), 2),
        pad(dDate.getMinutes(), 2),
        pad(dDate.getSeconds(), 2)
    ].join(':');

    return sDatePart + ' ' + sTimePart + '.' + pad(dDate.getMilliseconds(), 3);
}

//Sets the global iJSNo (the script number).
//  Development (bIsDev): the number is read from the script file name - the last run of digits
//  in WScript.ScriptName, so "JS12.js" gives 12. A name without digits leaves iJSNo unchanged.
//  Otherwise: when iJSNo is 0 it is read from the "ScriptNo" value of AOSession.
function setScriptNo() {
    //Autodetect the script number from the name
    if (bIsDev) {
        //Capture the whole last run of digits, not just its final digit
        var aMatch = /(\d+)\D*$/.exec(WScript.ScriptName.toString());
        if (aMatch != null) {
            //Radix 10 so that a leading zero is never read as octal
            iJSNo = parseInt(aMatch[1], 10);
        }
    }
    else {
        if (iJSNo == 0) {
            iJSNo = parseInt(AOSession.getValue("ScriptNo"), 10);
        }
    }
}

//WINDOWS TEST FUNCTIONS - For use with WScript
//Run the file-based test harness as soon as the script loads when external testing is enabled
//(isExternalTest is switched on automatically above when AOSession is not defined).
if (isExternalTest) ExternalTest();

//File-based test harness for running the script outside Language Studio (WScript).
//Reads the source and target test files, runs main() on them, writes the result to the
//output file and, when isDebug is set, echoes the collected log.
function ExternalTest() {
    if (isDebug) WScript.Echo('--Start Test--');

    //Read input from a file
    var sAllSourceSegments = fileToString(sTestInFilePathSource);
    var sAllTargetSegments = fileToString(sTestInFilePathTarget);

    //Normalize CRLF (and lone CR) to \n in BOTH texts - the segment splitter only understands \n,
    //so a target file with Windows line breaks would otherwise leave a \r on every line
    sAllSourceSegments = sAllSourceSegments.replace(/\r\n|\r/g, "\n");
    sAllTargetSegments = sAllTargetSegments.replace(/\r\n|\r/g, "\n");

    //Execute test as a normal post-processing step would execute it.
    var sOut = main(sAllSourceSegments, sAllTargetSegments);

    //Write the output to a file
    stringToFile(sTestOutFilePath, sOut);

    if (isDebug) {
        if (logText.length > 0) {
            WScript.Echo(logText);
        }
        WScript.Echo('--End Test--');
    }
}

//Returns the CurrentDirectory reported by a WScript.Shell object.
function currentPath() {
    var oShell = new ActiveXObject("WScript.Shell");
    return oShell.CurrentDirectory;
}

//Load the UTF-8 file into a string
//  sFilePath - path of the file to read
//  Returns the file content with a leading BOM removed.
//  Errors raised while opening or reading are passed on to the caller after the stream is closed.
function fileToString(sFilePath) {
    //Relies on Microsoft ActiveX Data Objects (ADO) which must be installed
    var oStream = new ActiveXObject("ADODB.Stream");
    var bOpened = false;
    try {
        oStream.CharSet = "utf-8";
        oStream.Open(); //Explicit call - a bare "oStream.Open" is only a property reference
        bOpened = true;
        oStream.LoadFromFile(sFilePath);
        return removeBOM(oStream.ReadText());
    }
    finally {
        //Always release the stream, but only when it was actually opened
        if (bOpened) {
            oStream.Close();
        }
    }
}

//Write a string out to a UTF-8 file
//  sFilePath - path of the file to write (an existing file is overwritten)
//  sData     - the text to write
//  Errors raised while writing are passed on to the caller after the stream is closed.
function stringToFile(sFilePath, sData) {
    //Relies on Microsoft ActiveX Data Objects (ADO) which must be installed
    var oStream = new ActiveXObject("ADODB.Stream");
    var bOpened = false;
    try {
        oStream.CharSet = "utf-8";
        oStream.Open(); //Explicit call - a bare "oStream.Open" is only a property reference
        bOpened = true;
        oStream.WriteText(sData);
        oStream.SaveToFile(sFilePath, 2); //2 = adSaveCreateOverWrite
        oStream.Flush();
    }
    finally {
        //Always release the stream, but only when it was actually opened
        if (bOpened) {
            oStream.Close();
        }
    }
}
How can we help you today?

Matching Source Capitalization Print

How can we help you today?

Matching Source Capitalization Print

Related Articles