Search for different strings between two texts (JavaScript)

0

I need to find the words and short phrases (up to 5 words long, ideally configurable) that differ between one text and another. I want to store the differences in an array so that each word or phrase is saved only once (checking beforehand whether it already exists). So far I have managed to find the differing single words, but I do not know how to go on to look for phrases of 2 consecutive words, then 3 words in the next iteration, then 4, and so on.

    // Work-in-progress script: collects into `diferentes` the words of each
    // text that are not found in the other text, and logs an attempt at
    // building multi-word phrases in `frase`.
    var texto = "Aquí va el el primer texto a analizar e incluso con palabras repetidas repetidas";
    var texto2 = "Aquí va el segundo texto a analizar analizar e incluso con algunas palabras repetidas";

    // Both texts are tokenised on single spaces.
    var palabras = texto.split(" ");
    var palabras2 = texto2.split(" ");

    // Unique differing words found so far (no duplicates are pushed).
    var diferentes = [];

    var max_palabras = 3;
    // Intended cap: until diferentes.length >= 50.
    // NOTE: max_busquedas is not referenced anywhere else in this script.
    var max_busquedas = 50;
    var cont_palabras = 0; // pass counter; also used as an offset when extending frase
    var frase = "";
    var newStr = ""; // NOTE: never used below
    var addWord = false;

    // The body runs for cont_palabras = 0 .. max_palabras inclusive, and every
    // pass repeats the same single-word comparison over all of palabras.
    do {


        for (var i=0; i<palabras.length; i++) {

            console.log("añade palabra = "+addWord);

            // texto2 is a string, so includes() is a substring test over the
            // whole text: a word that only appears inside a longer word of
            // texto2 still counts as present.
            if (texto2.includes(palabras[i]) == false) {

                if (diferentes.includes(palabras[i]) == false) { // only if not already in the differences array
                    diferentes.push(palabras[i]);
                }
            }

            // First pass only: frase is simply the current word.
            if (cont_palabras < 1) {
                frase = palabras[i];
                console.log("FRASE = "+frase);
            } 

            // Later passes: addWord is true only until the first append of the
            // pass (it is reset right below and set again after the for loop),
            // so a single word is appended per pass. frase is not reset between
            // passes, so it still holds whatever the previous pass left in it.
            // The index i+cont_palabras is not bounds-checked.
            if (cont_palabras >= 1 && addWord == true) {

                frase = frase + " " + palabras[i+cont_palabras];
                addWord = false;    
                console.log("FRASE = "+frase);

                console.log("Nueva frase construida (cont = "+ (cont_palabras+1) +") = "+ frase);
            }


        }

        // Prepare the next pass: one more word, and allow one append.
        cont_palabras++;
        addWord = true;

    } while (cont_palabras <= max_palabras);


    // Same single-word check in the other direction: words of texto2 that are
    // not found (as substrings) in texto.
    for (var i=0; i<palabras2.length; i++) {

      if (texto.includes(palabras2[i]) == false) {

         if (diferentes.includes(palabras2[i]) == false) { // only if not already in the differences array
            diferentes.push(palabras2[i]);
         }
      }
    }

    // Print every collected difference with its index.
    for (var j=0; j<diferentes.length; j++) {
       console.log("Elemento "+ j + " diferente en ambos textos = "+diferentes[j])
    }

How can I take one more word from text 1 on each pass and check whether the resulting phrase exists in text 2, so that all possible phrases are examined (the words must always be consecutive)?

P.S. I have updated the code again with some progress, but now I do not know how to take the words of the text in groups of 2, groups of 3, etc.

    
asked by Norak 09.11.2017 в 11:23
source

2 answers

2

See if this comes close to what you're looking for.

What the function checkDifferences does is find the matching strings between the two texts and get the rest as differences.

If you want, you could then break the longer differences down into smaller elements.

The checkDifferencesByLength function searches the first text for groups of words of a certain length that do not exist in the second.

/**
 * Splits two texts into the word runs they share and the word runs that are
 * unique to each of them.
 *
 * The longest run of consecutive words of `text1` that also occurs in `text2`
 * is taken as a coincidence; the words before and after it are then compared
 * recursively in the same way. Whatever cannot be matched is reported as a
 * difference.
 *
 * Matching is done on whole words (not on raw substrings), so "a" does not
 * match inside "analizar". Words are separated by whitespace; repeated,
 * leading and trailing whitespace is ignored and reported runs are joined
 * with a single space.
 *
 * @param {string} text1 - First text.
 * @param {string} text2 - Second text.
 * @returns {{differences1: string[], differences2: string[], coincidences: string[]}}
 *   `differences1`: runs found only in `text1`; `differences2`: runs found
 *   only in `text2`; `coincidences`: shared runs, in the order they occur.
 */
function checkDifferences(text1, text2) {
  // Tokenise on whitespace and drop empty tokens so stray spaces never match.
  const toWords = (text) => text.split(/\s+/).filter((word) => word.length > 0);

  // First index in `haystack` where the run needle[start, start + size) occurs, or -1.
  const findRun = (haystack, needle, start, size) => {
    for (let k = 0; k + size <= haystack.length; k++) {
      let offset = 0;
      while (offset < size && haystack[k + offset] === needle[start + offset]) {
        offset++;
      }
      if (offset === size) return k;
    }
    return -1;
  };

  // Recursive comparison of two word arrays.
  const diffWords = (words1, words2) => {
    // Try the longest possible run first so the biggest coincidence wins.
    for (let size = Math.min(words1.length, words2.length); size > 0; size--) {
      for (let start = 0; start + size <= words1.length; start++) {
        const found = findRun(words2, words1, start, size);
        if (found < 0) continue;

        // Coincidence found: compare what precedes it and what follows it.
        const before = diffWords(words1.slice(0, start), words2.slice(0, found));
        const after = diffWords(words1.slice(start + size), words2.slice(found + size));
        const pattern = words1.slice(start, start + size).join(' ');
        return {
          differences1: before.differences1.concat(after.differences1),
          differences2: before.differences2.concat(after.differences2),
          coincidences: before.coincidences.concat([pattern], after.coincidences),
        };
      }
    }
    // No coincidence at all: each remaining run is a difference as a whole.
    return {
      differences1: words1.length ? [words1.join(' ')] : [],
      differences2: words2.length ? [words2.join(' ')] : [],
      coincidences: [],
    };
  };

  return diffWords(toWords(text1), toWords(text2));
}

/**
 * Lists the groups of `length` consecutive words of `text1` that do not occur
 * as a group of consecutive whole words in `text2`.
 *
 * Both texts are split on single spaces. Groups are compared word by word, so
 * a group is not considered present just because its characters appear inside
 * longer words of `text2` (e.g. "a b" is not found in "xa bc").
 *
 * @param {string} text1 - Text whose word groups are examined.
 * @param {string} text2 - Text the groups are looked up in.
 * @param {number} length - Number of words per group.
 * @returns {?string[]} The missing groups in order of appearance (repeated
 *   groups are listed each time they occur), or `null` when `text1` has fewer
 *   than `length` words.
 */
function checkDifferencesByLength(text1, text2, length) {
  const words1 = text1.split(' ');
  if (words1.length < length) return null;

  // Index every group of `length` words of text2 once, so each lookup is a
  // whole-word comparison instead of a substring search.
  const words2 = text2.split(' ');
  const groupsInText2 = new Set();
  for (let k = 0; k + length <= words2.length; k++) {
    groupsInText2.add(words2.slice(k, k + length).join(' '));
  }

  const differences = [];
  for (let i = 0; i + length <= words1.length; i++) {
    const pattern = words1.slice(i, i + length).join(' ');
    if (!groupsInText2.has(pattern)) differences.push(pattern);
  }
  return differences;
}

// Sample texts used to demonstrate both functions.
var texto = "Aquí va el el primer texto a analizar e incluso con palabras repetidas repetidas";
var texto2 = "Aquí va el segundo texto a analizar analizar e incluso con algunas palabras repetidas";

// Logs a heading immediately followed by the items, one per line.
function printList(heading, items) {
  console.log(heading + items.join('\n'));
}

// Full comparison: shared runs and the runs unique to each text.
var result = checkDifferences(texto, texto2);

console.log(`Primer texto: ${texto}`);
console.log(`Segundo texto: ${texto2}`);

printList('Coincidencias:\n', result.coincidences);
printList('Elementos diferentes en primer texto:\n', result.differences1);
printList('Elementos diferentes en segundo texto:\n', result.differences2);

// Fixed-size comparison: groups of 4 words, first in one direction...
var resultByLength = checkDifferencesByLength(texto, texto2, 4);
printList('Grupos de 4 palabras del primer texto que no existen en el segundo:\n', resultByLength);
// ...and then in the other.
resultByLength = checkDifferencesByLength(texto2, texto, 4);
printList('Grupos de 4 palabras del segundo texto que no existen en el primero:\n', resultByLength);
    
answered by 09.11.2017 / 12:37
source
0

/**
 * Returns the words of `s1` that are at most `max` characters long and do not
 * appear as a word of `s2`. Both texts are split on single spaces; the order
 * of `s1` is kept and repeated words are returned once per occurrence.
 *
 * @param {number} max - Maximum word length, in characters.
 * @param {string} s1 - Text whose words are filtered.
 * @param {string} s2 - Text providing the words to exclude.
 * @returns {string[]} The qualifying words of `s1`.
 */
function shorter(max, s1, s2) {
  const otherWords = s2.split(" ");
  return s1.split(" ").filter(function (word) {
    return word.length <= max && !otherWords.includes(word);
  });
}


// Demo: words of the first text, at most 6 characters long, that are not
// words of the second text.
var texto = "Aquí va el el primer texto a analizar e incluso con palabras repetidas repetidas";
var texto2 = "Aquí va el segundo texto a analizar analizar e incluso con algunas palabras repetidas";
console.log(shorter(6, texto, texto2));
    
answered by 09.11.2017 в 13:50