Path 1: 719 calls (0.97 of all calls)

lineset1: LineSet (719)
lineset2: LineSet (719)

def _find_common(
    self, lineset1: LineSet, lineset2: LineSet
) -> Generator[Commonality, None, None]:
    """Find similarities in the two given linesets.

    This is the core of the algorithm. The idea is to compute the hashes of a
    minimal number of successive lines of each lineset and then compare the
    hashes. Every match of this comparison is stored in a dict that links the
    couple of starting indices in both linesets to the couple of corresponding
    starting and ending lines in both files.

    Finally, all successive couples are regrouped into bigger ones, so that
    common chunks spanning more than the minimal number of required
    successive lines are taken into account.
    """
    hash_to_index_1: HashToIndex_T
    hash_to_index_2: HashToIndex_T
    index_to_lines_1: IndexToLines_T
    index_to_lines_2: IndexToLines_T
    hash_to_index_1, index_to_lines_1 = hash_lineset(
        lineset1, self.namespace.min_similarity_lines
    )
    hash_to_index_2, index_to_lines_2 = hash_lineset(
        lineset2, self.namespace.min_similarity_lines
    )

    hash_1: frozenset[LinesChunk] = frozenset(hash_to_index_1.keys())
    hash_2: frozenset[LinesChunk] = frozenset(hash_to_index_2.keys())

    common_hashes: Iterable[LinesChunk] = sorted(
        hash_1 & hash_2, key=lambda m: hash_to_index_1[m][0]
    )

    # all_couples links the couple of indices in both linesets that mark the
    # beginning of successive common lines to the corresponding starting and
    # ending line numbers in both files.
    all_couples: CplIndexToCplLines_T = {}

    for c_hash in sorted(common_hashes, key=operator.attrgetter("_index")):
        for indices_in_linesets in itertools.product(
            hash_to_index_1[c_hash], hash_to_index_2[c_hash]
        ):
            index_1 = indices_in_linesets[0]
            index_2 = indices_in_linesets[1]
            all_couples[
                LineSetStartCouple(index_1, index_2)
            ] = CplSuccessiveLinesLimits(
                copy.copy(index_to_lines_1[index_1]),
                copy.copy(index_to_lines_2[index_2]),
                effective_cmn_lines_nb=self.namespace.min_similarity_lines,
            )

    remove_successive(all_couples)

    for cml_stripped_l, cmn_l in all_couples.items():
        start_index_1 = cml_stripped_l.fst_lineset_index
        start_index_2 = cml_stripped_l.snd_lineset_index
        nb_common_lines = cmn_l.effective_cmn_lines_nb

        com = Commonality(
            cmn_lines_nb=nb_common_lines,
            fst_lset=lineset1,
            fst_file_start=cmn_l.first_file.start,
            fst_file_end=cmn_l.first_file.end,
            snd_lset=lineset2,
            snd_file_start=cmn_l.second_file.start,
            snd_file_end=cmn_l.second_file.end,
        )

        eff_cmn_nb = filter_noncode_lines(
            lineset1, start_index_1, lineset2, start_index_2, nb_common_lines
        )

        if eff_cmn_nb > self.namespace.min_similarity_lines:
            yield com
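The hashing step described in the docstring can be illustrated with a self-contained sketch. This is not pylint's hash_lineset (which works on LineSet objects and produces LinesChunk hashes); here plain tuples of stripped strings stand in for both, and the helper names are hypothetical.

from collections import defaultdict

def hash_chunks(lines: list[str], min_lines: int) -> dict[tuple[str, ...], list[int]]:
    """Map each window of min_lines successive lines to the indices where it starts."""
    chunk_to_starts: dict[tuple[str, ...], list[int]] = defaultdict(list)
    for start in range(len(lines) - min_lines + 1):
        chunk = tuple(line.strip() for line in lines[start : start + min_lines])
        chunk_to_starts[chunk].append(start)
    return chunk_to_starts

def common_chunk_starts(lines_a: list[str], lines_b: list[str], min_lines: int = 4):
    """Yield (start_in_a, start_in_b) couples for every chunk both sources share."""
    chunks_a = hash_chunks(lines_a, min_lines)
    chunks_b = hash_chunks(lines_b, min_lines)
    # Sort by first occurrence in the first source, mirroring the deterministic
    # ordering the real code imposes on common_hashes.
    for chunk in sorted(chunks_a.keys() & chunks_b.keys(), key=lambda c: chunks_a[c][0]):
        for start_a in chunks_a[chunk]:
            for start_b in chunks_b[chunk]:
                yield start_a, start_b

Each couple produced this way is a candidate similarity of exactly min_lines lines, which is why the real code then regroups successive couples into longer matches.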

Path 2: 19 calls (0.03 of all calls)

lineset1: LineSet (19)
lineset2: LineSet (19)
Yields: Commonality (23)

(Function body identical to Path 1.)
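The "regrouping" step that the docstring mentions merges couples whose start indices are successive. Below is a minimal sketch of the idea, assuming couples are plain (index_1, index_2) tuples rather than LineSetStartCouple keys; the real remove_successive mutates all_couples in place instead of returning a new collection.

def merge_successive(couples: set[tuple[int, int]], min_lines: int) -> list[tuple[int, int, int]]:
    """Collapse runs of successive (i, j) couples into (i, j, total_lines) triples."""
    merged = []
    for i, j in sorted(couples):
        if (i - 1, j - 1) in couples:
            continue  # already covered by the couple that starts this run
        run = 0
        while (i + run + 1, j + run + 1) in couples:
            run += 1
        merged.append((i, j, min_lines + run))
    return merged

For example, merge_successive({(0, 5), (1, 6), (2, 7)}, min_lines=4) returns [(0, 5, 6)]: three overlapping four-line matches collapse into a single six-line match.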

Path 3: 2 calls (0.0 of all calls)

lineset1: LineSet (2)
lineset2: LineSet (2)

(Function body identical to Path 1.)
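Finally, the guard at the end of the function recomputes the effective number of common lines, discounting lines that carry no code, and yields the Commonality only when that count still exceeds min_similarity_lines. A hedged sketch of the idea behind filter_noncode_lines (the real helper inspects LineSet internals, and its exact filtering rules may differ):

def count_effective_code_lines(common_lines: list[str]) -> int:
    """Count lines that are neither blank nor comment-only."""
    return sum(
        1
        for line in common_lines
        if line.strip() and not line.strip().startswith("#")
    )

So count_effective_code_lines(["x = 1", "", "# note", "y = 2"]) is 2, and a matching chunk padded out with blank or comment-only lines is not reported even if its raw length exceeds the threshold.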