Method: pylint.checkers.similar.Similar._find_common
Calls: 740, Exceptions: 0, Paths: 3
Path 1: 719 calls (0.97)
LineSet (719)
LineSet (719)
def _find_common(
    self, lineset1: LineSet, lineset2: LineSet
) -> Generator[Commonality, None, None]:
    """Find similarities in the two given linesets.

    This is the core of the algorithm. The idea is to compute the hashes of a
    minimal number of successive lines of each lineset and then compare the
    hashes. Every match of such a comparison is stored in a dict that links the
    couple of starting indices in both linesets to the couple of corresponding
    starting and ending lines in both files.

    Finally, it regroups all successive couples into bigger ones, so that
    common chunks of lines longer than the minimal required number of
    successive lines are taken into account.
    """
    hash_to_index_1: HashToIndex_T
    hash_to_index_2: HashToIndex_T
    index_to_lines_1: IndexToLines_T
    index_to_lines_2: IndexToLines_T
    hash_to_index_1, index_to_lines_1 = hash_lineset(
        lineset1, self.namespace.min_similarity_lines
    )
    hash_to_index_2, index_to_lines_2 = hash_lineset(
        lineset2, self.namespace.min_similarity_lines
    )

    hash_1: frozenset[LinesChunk] = frozenset(hash_to_index_1.keys())
    hash_2: frozenset[LinesChunk] = frozenset(hash_to_index_2.keys())

    common_hashes: Iterable[LinesChunk] = sorted(
        hash_1 & hash_2, key=lambda m: hash_to_index_1[m][0]
    )

    # all_couples is a dict that links the couple of indices in both linesets
    # that mark the beginning of successive common lines, to the corresponding
    # starting and ending line numbers in both files
    all_couples: CplIndexToCplLines_T = {}

    for c_hash in sorted(common_hashes, key=operator.attrgetter("_index")):
        for indices_in_linesets in itertools.product(
            hash_to_index_1[c_hash], hash_to_index_2[c_hash]
        ):
            index_1 = indices_in_linesets[0]
            index_2 = indices_in_linesets[1]
            all_couples[
                LineSetStartCouple(index_1, index_2)
            ] = CplSuccessiveLinesLimits(
                copy.copy(index_to_lines_1[index_1]),
                copy.copy(index_to_lines_2[index_2]),
                effective_cmn_lines_nb=self.namespace.min_similarity_lines,
            )

    remove_successive(all_couples)

    for cml_stripped_l, cmn_l in all_couples.items():
        start_index_1 = cml_stripped_l.fst_lineset_index
        start_index_2 = cml_stripped_l.snd_lineset_index
        nb_common_lines = cmn_l.effective_cmn_lines_nb

        com = Commonality(
            cmn_lines_nb=nb_common_lines,
            fst_lset=lineset1,
            fst_file_start=cmn_l.first_file.start,
            fst_file_end=cmn_l.first_file.end,
            snd_lset=lineset2,
            snd_file_start=cmn_l.second_file.start,
            snd_file_end=cmn_l.second_file.end,
        )

        eff_cmn_nb = filter_noncode_lines(
            lineset1, start_index_1, lineset2, start_index_2, nb_common_lines
        )

        if eff_cmn_nb > self.namespace.min_similarity_lines:
            yield com
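
To make the docstring's idea concrete, here is a minimal, self-contained sketch of the same windowed-hashing scheme: hash every run of min_lines successive lines in each input, intersect the hash sets, collect the matching couples of starting indices (analogous to all_couples above), then regroup successive couples into maximal chunks (as remove_successive does). This is an illustration only, not pylint's implementation; the names hash_windows and find_common_chunks are hypothetical, and the real code additionally tracks file line numbers and filters non-code lines.

    from itertools import product


    def hash_windows(lines: list[str], min_lines: int) -> dict[int, list[int]]:
        """Map the hash of each run of `min_lines` successive lines to the
        indices where that run starts (hypothetical helper)."""
        index: dict[int, list[int]] = {}
        for start in range(len(lines) - min_lines + 1):
            chunk_hash = hash(tuple(lines[start : start + min_lines]))
            index.setdefault(chunk_hash, []).append(start)
        return index


    def find_common_chunks(lines1, lines2, min_lines=4):
        """Yield (start1, start2, length) for maximal common runs of at
        least `min_lines` successive lines (sketch, not pylint's code)."""
        idx1 = hash_windows(lines1, min_lines)
        idx2 = hash_windows(lines2, min_lines)
        # Every couple of starting indices whose windows hash the same.
        couples = {
            (s1, s2)
            for common in idx1.keys() & idx2.keys()
            for s1, s2 in product(idx1[common], idx2[common])
        }
        # Regroup successive couples into bigger chunks: a couple starts a
        # chunk only if its predecessor is not itself a match.
        for s1, s2 in sorted(couples):
            if (s1 - 1, s2 - 1) in couples:
                continue  # continuation of a chunk reported earlier
            k = 1
            while (s1 + k, s2 + k) in couples:
                k += 1
            yield s1, s2, min_lines + k - 1


    if __name__ == "__main__":
        a = ["x = 1", "y = 2", "z = 3", "print(x)", "done()"]
        b = ["import os", "x = 1", "y = 2", "z = 3", "print(x)"]
        print(list(find_common_chunks(a, b, min_lines=3)))  # [(0, 1, 4)]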
Path 2: 19 calls (0.03)
LineSet (19)
LineSet (19)
Commonality (23)
(code listing identical to Path 1)
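
The Commonality (23) annotation above indicates that Path 2 is the variant in which the generator actually yields: 23 Commonality objects came out of its 19 calls. Below is a sketch of how _find_common is typically exercised end to end, assuming the public Similar API from pylint.checkers.similar (the engine behind the symilar command); exact signatures may vary by pylint version, and the file names are examples.

    from pylint.checkers.similar import Similar

    # Build the detector with the same threshold that _find_common reads
    # from self.namespace.min_similarity_lines.
    sim = Similar(min_lines=4)

    # Each appended stream becomes a LineSet; _find_common is later
    # invoked on every pair of them.
    for path in ("module_a.py", "module_b.py"):  # example file names
        with open(path, encoding="utf-8") as stream:
            sim.append_stream(path, stream)

    sim.run()  # computes similarities and prints duplicate chunks to stdout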
Path 3: 2 calls (0.0)
LineSet (2)
LineSet (2)
(code listing identical to Path 1)