Skip to content

Commit

Permalink
issue #58 fix chinese translations,detection using unicode values
Browse files Browse the repository at this point in the history
  • Loading branch information
parthosa committed Mar 8, 2017
1 parent 6025d59 commit 0859fd8
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 11 deletions.
98 changes: 88 additions & 10 deletions lib/scripts/mtw.js
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,14 @@ export class ContentScript {
paragraphs = document.getElementsByTagName('p');
console.log('Getting words from all ' + paragraphs.length + ' paragraphs');
for (var i = 0; i < paragraphs.length; i++) {
var words = paragraphs[i].innerText.split(/\s|,|[.()]|\d/g);
var words = paragraphs[i].innerText;
if(this.clkTest(words)){
words = words.replace(/\d|\s|[()]/g,'').split('').filter(v=>v!='');
}
else{
words = words.split(/\s|,|[.()]|\d/g);
}
// console.log(words);
for (var j = 0; j < words.length; j++) {
for (var b = ngramMin; b <= ngramMax; b++) {
var word = words.slice(j, j + b).join(' ');
Expand Down Expand Up @@ -179,11 +186,16 @@ export class ContentScript {
var blackListReg = new RegExp(userBlacklistedWords);
var punctuationReg = new RegExp(/[\.,\/#\!\$%\^&\*;:{}=\\\_`~()\?@\d\+\-]/g);
var countedWordsList = this.shuffle(this.toList(countedWords, (word, count) => {
return !!word && word.length >= 2 && // no words that are too short
word !== '' && !/\d/.test(word) && // no empty words
word.charAt(0) !== word.charAt(0).toUpperCase() && // no proper nouns
!blackListReg.test(word.toLowerCase()) && // no blacklisted words
!punctuationReg.test(word.toLowerCase()); // no punctuation marks
if(this.clkTest(word))

return !!word && word !== '' && !/\d/.test(word) && // no empty words
!blackListReg.test(word.toLowerCase()) && // no blacklisted words
!punctuationReg.test(word.toLowerCase()); // no punctuation marksreturn !!word && word.length >= 2 && // no words that are too short
else
return word !== '' && !/\d/.test(word) && // no empty words
word.charAt(0) !== word.charAt(0).toUpperCase() && // no proper nouns
!blackListReg.test(word.toLowerCase()) && // no blacklisted words
!punctuationReg.test(word.toLowerCase()); // no punctuation marks
}));
var targetLength = Math.floor((Object.keys(countedWordsList).length * translationProbability) / 100);
return this.toMap(countedWordsList.slice(0, targetLength - 1));
Expand Down Expand Up @@ -280,7 +292,6 @@ export class ContentScript {
}
}
});

if (Object.keys(filteredTMap).length !== 0) {
var paragraphs = document.getElementsByTagName('p');
if (this.oneWordTranslation) {
Expand All @@ -294,6 +305,8 @@ export class ContentScript {
}
}

// console.log(filteredTMap);

// Add event listener to each word for toggle
var translatedWords = document.querySelectorAll('.mtwTranslatedWord, .mtwTranslatedWorde, .mtwTranslatedWordn, .mtwTranslatedWordh');
for (let i = 0; i < translatedWords.length; i++) {
Expand Down Expand Up @@ -395,11 +408,33 @@ export class ContentScript {
deepHTMLReplacement(node, tMap, iTMap) {
var badTags = ['TEXTAREA', 'INPUT', 'SCRIPT', 'CODE', 'A', 'SPAN'];
if (node.nodeType === Node.TEXT_NODE) {
var newNodeValue = this.replaceAll(node.nodeValue, tMap);
var newNodeValue;
if(this.targetLanguage == "zh"){
newNodeValue = this.replaceAll(node.nodeValue, tMap);
}
else{
if(this.clkTest(node.nodeValue)){
newNodeValue = this.replaceAllClk(node.nodeValue, tMap);
}
else{
newNodeValue = this.replaceAll(node.nodeValue, tMap);
}
}
if (newNodeValue !== node.nodeValue) {
node.nodeValue = newNodeValue;
var parent = node.parentNode;
parent.innerHTML = this.replaceAll(parent.innerHTML, iTMap);
if(this.targetLanguage == "zh"){
parent.innerHTML = this.replaceAll(parent.innerHTML, iTMap);
}
else{
if(this.clkTest(node.nodeValue)){
parent.innerHTML = this.replaceAllClk(parent.innerHTML, iTMap);
}
else{
parent.innerHTML = this.replaceAll(parent.innerHTML, iTMap);
}
}
// parent.innerHTML = this.replaceAll(parent.innerHTML, iTMap);
}
} else if (node.nodeType === Node.ELEMENT_NODE && badTags.indexOf(node.tagName) <= -1) {
var innerNodes = node.childNodes;
Expand Down Expand Up @@ -434,6 +469,40 @@ export class ContentScript {
return ' ' + m + ' ';
}
});

if (/^\s*$/.test(newText)) {
return text;
}
return newText;
}


/**
* Returns the text with source words replaced by their translations
* (used only for CLK-language text); if nothing is translated, returns the same text
* @param {string} text - source text
* @param {Object} translationMap - translations for source words
* @returns {string} text - text with translations
*/
replaceAllClk(text, translationMap) {
var rExp = '';
var sortedSourceWords = Object.keys(translationMap)
.sort((w1, w2) => {
return w2.length - w1.length;
});
sortedSourceWords.forEach((sourceWord) => {
rExp += '(' + this.escapeRegExp(sourceWord) + ')|';
});
rExp = rExp.substring(0, rExp.length - 1);
var regExp = new RegExp(rExp, 'gm');
var newText = text.replace(regExp, (m) => {
if (translationMap[m] !== null) {
return ' ' + translationMap[m] + ' ';
} else {
return ' ' + m + ' ';
}
});

if (/^\s*$/.test(newText)) {
return text;
}
Expand Down Expand Up @@ -572,7 +641,7 @@ export class ContentScript {

sendError(message) {
if(message == '')
message = 'Could Not Connect To Translator Service';
message = 'Could not connect to '+this.translator+' Service .\nIt may be temporarily unavailable or you may be experiencing internet connection problems ';

var date = new Date();

Expand All @@ -597,6 +666,15 @@ export class ContentScript {

});
}


clkTest(str){
var clk_main = new RegExp("[\u4E00-\u9FFF]");
var clk_extension = new RegExp("[\u3400-\u4DBF]");
var clk_strokes = new RegExp("[\u31C0-\u31EF]");
var clk_symbols_punctuation = new RegExp("[\u3000-\u303F]");
return (clk_main.test(str)||clk_extension.test(str)||clk_strokes.test(str)||clk_symbols_punctuation.test(str));
}
}

var MTWTranslator = new ContentScript();
Expand Down
3 changes: 2 additions & 1 deletion lib/views/includes/troubleshooting.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
<h5 class="text-right">Facing Issues? <a href="https://gitlab.com/aossie/MindTheWord/issues">Please report to us</a> </h5>
<div class="row">
<div class="col-lg-12">
<div class="panel panel-primary">
Expand All @@ -20,7 +21,7 @@ <h5 class="text-muted"><span class="glyphicon glyphicon-exclamation-sign"></span
<div class="list-group-item clearfix" ng-repeat="logMessage in opctrl.logMessages">
<div class="col-lg-7">
<p style="margin: 0">{{logMessage.message}}</p>
<h6 style="color: #888; margin: 0;">({{logMessage.url}})</h6>
<h6 style="color: #888; margin: 0;word-wrap: break-word;">({{logMessage.url}})</h6>
</div>
<div class="col-lg-2">
{{logMessage.date}}
Expand Down

0 comments on commit 0859fd8

Please sign in to comment.