Method: csv.Sniffer._guess_delimiter
Calls: 14, Exceptions: 0, Paths: 5
Path 1: 4 calls (0.29)
"Harry's, Arlington Heights, IL, 2/1/03, Kimi Hayes\nShark City, Glendale Heights, IL, 12/28/02, Prezence\nTommy's Place, Blue Island, IL, 12/28/02, B...
None (4)
(',', True) (2) (';', False) (1) ('\t', False) (1)
1def _guess_delimiter(self, data, delimiters):
2 """
3 The delimiter /should/ occur the same number of times on
4 each row. However, due to malformed data, it may not. We don't want
5 an all or nothing approach, so we allow for small variations in this
6 number.
7 1) build a table of the frequency of each character on every line.
8 2) build a table of frequencies of this frequency (meta-frequency?),
9 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
10 7 times in 2 rows'
11 3) use the mode of the meta-frequency to determine the /expected/
12 frequency for that character
13 4) find out how often the character actually meets that goal
14 5) the character that best meets its goal is the delimiter
15 For performance reasons, the data is evaluated in chunks, so it can
16 try and evaluate the smallest portion of the data possible, evaluating
17 additional chunks as necessary.
18 """
19
20 data = list(filter(None, data.split('\n')))
21
22 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
23
24 # build frequency tables
25 chunkLength = min(10, len(data))
26 iteration = 0
27 charFrequency = {}
28 modes = {}
29 delims = {}
30 start, end = 0, chunkLength
31 while start < len(data):
32 iteration += 1
33 for line in data[start:end]:
34 for char in ascii:
35 metaFrequency = charFrequency.get(char, {})
36 # must count even if frequency is 0
37 freq = line.count(char)
38 # value is the mode
39 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
40 charFrequency[char] = metaFrequency
41
42 for char in charFrequency.keys():
43 items = list(charFrequency[char].items())
44 if len(items) == 1 and items[0][0] == 0:
45 continue
46 # get the mode of the frequencies
47 if len(items) > 1:
48 modes[char] = max(items, key=lambda x: x[1])
49 # adjust the mode - subtract the sum of all
50 # other frequencies
51 items.remove(modes[char])
52 modes[char] = (modes[char][0], modes[char][1]
53 - sum(item[1] for item in items))
54 else:
55 modes[char] = items[0]
56
57 # build a list of possible delimiters
58 modeList = modes.items()
59 total = float(min(chunkLength * iteration, len(data)))
60 # (rows of consistent data) / (number of rows) = 100%
61 consistency = 1.0
62 # minimum consistency threshold
63 threshold = 0.9
64 while len(delims) == 0 and consistency >= threshold:
65 for k, v in modeList:
66 if v[0] > 0 and v[1] > 0:
67 if ((v[1]/total) >= consistency and
68 (delimiters is None or k in delimiters)):
69 delims[k] = v
70 consistency -= 0.01
71
72 if len(delims) == 1:
73 delim = list(delims.keys())[0]
74 skipinitialspace = (data[0].count(delim) ==
75 data[0].count("%c " % delim))
76 return (delim, skipinitialspace)
77
78 # analyze another chunkLength lines
79 start = end
80 end += chunkLength
81
82 if not delims:
83 return ('', 0)
84
85 # if there's more than one, fall back to a 'preferred' list
86 if len(delims) > 1:
87 for d in self.preferred:
88 if d in delims.keys():
89 skipinitialspace = (data[0].count(d) ==
90 data[0].count("%c " % d))
91 return (d, skipinitialspace)
92
93 # nothing else indicates a preference, pick the character that
94 # dominates(?)
95 items = [(v,k) for (k,v) in delims.items()]
96 items.sort()
97 delim = items[-1][1]
98
99 skipinitialspace = (data[0].count(delim) ==
100 data[0].count("%c " % delim))
101 return (delim, skipinitialspace)
Path 2: 4 calls (0.29)
"Harry's+ Arlington Heights+ IL+ 2/1/03+ Kimi Hayes\nShark City+ Glendale Heights+ IL+ 12/28/02+ Prezence\nTommy's Place+ Blue Island+ IL+ 12/28/02+ B...
None (4)
('+', True) (3) ('|', False) (1)
1def _guess_delimiter(self, data, delimiters):
2 """
3 The delimiter /should/ occur the same number of times on
4 each row. However, due to malformed data, it may not. We don't want
5 an all or nothing approach, so we allow for small variations in this
6 number.
7 1) build a table of the frequency of each character on every line.
8 2) build a table of frequencies of this frequency (meta-frequency?),
9 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
10 7 times in 2 rows'
11 3) use the mode of the meta-frequency to determine the /expected/
12 frequency for that character
13 4) find out how often the character actually meets that goal
14 5) the character that best meets its goal is the delimiter
15 For performance reasons, the data is evaluated in chunks, so it can
16 try and evaluate the smallest portion of the data possible, evaluating
17 additional chunks as necessary.
18 """
19
20 data = list(filter(None, data.split('\n')))
21
22 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
23
24 # build frequency tables
25 chunkLength = min(10, len(data))
26 iteration = 0
27 charFrequency = {}
28 modes = {}
29 delims = {}
30 start, end = 0, chunkLength
31 while start < len(data):
32 iteration += 1
33 for line in data[start:end]:
34 for char in ascii:
35 metaFrequency = charFrequency.get(char, {})
36 # must count even if frequency is 0
37 freq = line.count(char)
38 # value is the mode
39 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
40 charFrequency[char] = metaFrequency
41
42 for char in charFrequency.keys():
43 items = list(charFrequency[char].items())
44 if len(items) == 1 and items[0][0] == 0:
45 continue
46 # get the mode of the frequencies
47 if len(items) > 1:
48 modes[char] = max(items, key=lambda x: x[1])
49 # adjust the mode - subtract the sum of all
50 # other frequencies
51 items.remove(modes[char])
52 modes[char] = (modes[char][0], modes[char][1]
53 - sum(item[1] for item in items))
54 else:
55 modes[char] = items[0]
56
57 # build a list of possible delimiters
58 modeList = modes.items()
59 total = float(min(chunkLength * iteration, len(data)))
60 # (rows of consistent data) / (number of rows) = 100%
61 consistency = 1.0
62 # minimum consistency threshold
63 threshold = 0.9
64 while len(delims) == 0 and consistency >= threshold:
65 for k, v in modeList:
66 if v[0] > 0 and v[1] > 0:
67 if ((v[1]/total) >= consistency and
68 (delimiters is None or k in delimiters)):
69 delims[k] = v
70 consistency -= 0.01
71
72 if len(delims) == 1:
73 delim = list(delims.keys())[0]
74 skipinitialspace = (data[0].count(delim) ==
75 data[0].count("%c " % delim))
76 return (delim, skipinitialspace)
77
78 # analyze another chunkLength lines
79 start = end
80 end += chunkLength
81
82 if not delims:
83 return ('', 0)
84
85 # if there's more than one, fall back to a 'preferred' list
86 if len(delims) > 1:
87 for d in self.preferred:
88 if d in delims.keys():
89 skipinitialspace = (data[0].count(d) ==
90 data[0].count("%c " % d))
91 return (d, skipinitialspace)
92
93 # nothing else indicates a preference, pick the character that
94 # dominates(?)
95 items = [(v,k) for (k,v) in delims.items()]
96 items.sort()
97 delim = items[-1][1]
98
99 skipinitialspace = (data[0].count(delim) ==
100 data[0].count("%c " % delim))
101 return (delim, skipinitialspace)
Path 3: 3 calls (0.21)
'05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03\n05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03\n05/05/03?05/05/03?05/05/03?05/05/03?05/...
'?,' (1) '/,' (1) ',;' (1)
('?', False) (1) ('/', False) (1) (';', False) (1)
1def _guess_delimiter(self, data, delimiters):
2 """
3 The delimiter /should/ occur the same number of times on
4 each row. However, due to malformed data, it may not. We don't want
5 an all or nothing approach, so we allow for small variations in this
6 number.
7 1) build a table of the frequency of each character on every line.
8 2) build a table of frequencies of this frequency (meta-frequency?),
9 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
10 7 times in 2 rows'
11 3) use the mode of the meta-frequency to determine the /expected/
12 frequency for that character
13 4) find out how often the character actually meets that goal
14 5) the character that best meets its goal is the delimiter
15 For performance reasons, the data is evaluated in chunks, so it can
16 try and evaluate the smallest portion of the data possible, evaluating
17 additional chunks as necessary.
18 """
19
20 data = list(filter(None, data.split('\n')))
21
22 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
23
24 # build frequency tables
25 chunkLength = min(10, len(data))
26 iteration = 0
27 charFrequency = {}
28 modes = {}
29 delims = {}
30 start, end = 0, chunkLength
31 while start < len(data):
32 iteration += 1
33 for line in data[start:end]:
34 for char in ascii:
35 metaFrequency = charFrequency.get(char, {})
36 # must count even if frequency is 0
37 freq = line.count(char)
38 # value is the mode
39 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
40 charFrequency[char] = metaFrequency
41
42 for char in charFrequency.keys():
43 items = list(charFrequency[char].items())
44 if len(items) == 1 and items[0][0] == 0:
45 continue
46 # get the mode of the frequencies
47 if len(items) > 1:
48 modes[char] = max(items, key=lambda x: x[1])
49 # adjust the mode - subtract the sum of all
50 # other frequencies
51 items.remove(modes[char])
52 modes[char] = (modes[char][0], modes[char][1]
53 - sum(item[1] for item in items))
54 else:
55 modes[char] = items[0]
56
57 # build a list of possible delimiters
58 modeList = modes.items()
59 total = float(min(chunkLength * iteration, len(data)))
60 # (rows of consistent data) / (number of rows) = 100%
61 consistency = 1.0
62 # minimum consistency threshold
63 threshold = 0.9
64 while len(delims) == 0 and consistency >= threshold:
65 for k, v in modeList:
66 if v[0] > 0 and v[1] > 0:
67 if ((v[1]/total) >= consistency and
68 (delimiters is None or k in delimiters)):
69 delims[k] = v
70 consistency -= 0.01
71
72 if len(delims) == 1:
73 delim = list(delims.keys())[0]
74 skipinitialspace = (data[0].count(delim) ==
75 data[0].count("%c " % delim))
76 return (delim, skipinitialspace)
77
78 # analyze another chunkLength lines
79 start = end
80 end += chunkLength
81
82 if not delims:
83 return ('', 0)
84
85 # if there's more than one, fall back to a 'preferred' list
86 if len(delims) > 1:
87 for d in self.preferred:
88 if d in delims.keys():
89 skipinitialspace = (data[0].count(d) ==
90 data[0].count("%c " % d))
91 return (d, skipinitialspace)
92
93 # nothing else indicates a preference, pick the character that
94 # dominates(?)
95 items = [(v,k) for (k,v) in delims.items()]
96 items.sort()
97 delim = items[-1][1]
98
99 skipinitialspace = (data[0].count(delim) ==
100 data[0].count("%c " % delim))
101 return (delim, skipinitialspace)
Path 4: 2 calls (0.14)
'\nabc,def\nghijkl,mno\nghi,jkl\n' (1) '\nabc,def\nghijkl,mnop\nghi,jkl\n' (1)
None (2)
(',', False) (2)
1def _guess_delimiter(self, data, delimiters):
2 """
3 The delimiter /should/ occur the same number of times on
4 each row. However, due to malformed data, it may not. We don't want
5 an all or nothing approach, so we allow for small variations in this
6 number.
7 1) build a table of the frequency of each character on every line.
8 2) build a table of frequencies of this frequency (meta-frequency?),
9 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
10 7 times in 2 rows'
11 3) use the mode of the meta-frequency to determine the /expected/
12 frequency for that character
13 4) find out how often the character actually meets that goal
14 5) the character that best meets its goal is the delimiter
15 For performance reasons, the data is evaluated in chunks, so it can
16 try and evaluate the smallest portion of the data possible, evaluating
17 additional chunks as necessary.
18 """
19
20 data = list(filter(None, data.split('\n')))
21
22 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
23
24 # build frequency tables
25 chunkLength = min(10, len(data))
26 iteration = 0
27 charFrequency = {}
28 modes = {}
29 delims = {}
30 start, end = 0, chunkLength
31 while start < len(data):
32 iteration += 1
33 for line in data[start:end]:
34 for char in ascii:
35 metaFrequency = charFrequency.get(char, {})
36 # must count even if frequency is 0
37 freq = line.count(char)
38 # value is the mode
39 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
40 charFrequency[char] = metaFrequency
41
42 for char in charFrequency.keys():
43 items = list(charFrequency[char].items())
44 if len(items) == 1 and items[0][0] == 0:
45 continue
46 # get the mode of the frequencies
47 if len(items) > 1:
48 modes[char] = max(items, key=lambda x: x[1])
49 # adjust the mode - subtract the sum of all
50 # other frequencies
51 items.remove(modes[char])
52 modes[char] = (modes[char][0], modes[char][1]
53 - sum(item[1] for item in items))
54 else:
55 modes[char] = items[0]
56
57 # build a list of possible delimiters
58 modeList = modes.items()
59 total = float(min(chunkLength * iteration, len(data)))
60 # (rows of consistent data) / (number of rows) = 100%
61 consistency = 1.0
62 # minimum consistency threshold
63 threshold = 0.9
64 while len(delims) == 0 and consistency >= threshold:
65 for k, v in modeList:
66 if v[0] > 0 and v[1] > 0:
67 if ((v[1]/total) >= consistency and
68 (delimiters is None or k in delimiters)):
69 delims[k] = v
70 consistency -= 0.01
71
72 if len(delims) == 1:
73 delim = list(delims.keys())[0]
74 skipinitialspace = (data[0].count(delim) ==
75 data[0].count("%c " % delim))
76 return (delim, skipinitialspace)
77
78 # analyze another chunkLength lines
79 start = end
80 end += chunkLength
81
82 if not delims:
83 return ('', 0)
84
85 # if there's more than one, fall back to a 'preferred' list
86 if len(delims) > 1:
87 for d in self.preferred:
88 if d in delims.keys():
89 skipinitialspace = (data[0].count(d) ==
90 data[0].count("%c " % d))
91 return (d, skipinitialspace)
92
93 # nothing else indicates a preference, pick the character that
94 # dominates(?)
95 items = [(v,k) for (k,v) in delims.items()]
96 items.sort()
97 delim = items[-1][1]
98
99 skipinitialspace = (data[0].count(delim) ==
100 data[0].count("%c " % delim))
101 return (delim, skipinitialspace)
Path 5: 1 calls (0.07)
'05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03\n05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03\n05/05/03?05/05/03?05/05/03?05/05/03?05/...
None (1)
('0', False) (1)
1def _guess_delimiter(self, data, delimiters):
2 """
3 The delimiter /should/ occur the same number of times on
4 each row. However, due to malformed data, it may not. We don't want
5 an all or nothing approach, so we allow for small variations in this
6 number.
7 1) build a table of the frequency of each character on every line.
8 2) build a table of frequencies of this frequency (meta-frequency?),
9 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
10 7 times in 2 rows'
11 3) use the mode of the meta-frequency to determine the /expected/
12 frequency for that character
13 4) find out how often the character actually meets that goal
14 5) the character that best meets its goal is the delimiter
15 For performance reasons, the data is evaluated in chunks, so it can
16 try and evaluate the smallest portion of the data possible, evaluating
17 additional chunks as necessary.
18 """
19
20 data = list(filter(None, data.split('\n')))
21
22 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
23
24 # build frequency tables
25 chunkLength = min(10, len(data))
26 iteration = 0
27 charFrequency = {}
28 modes = {}
29 delims = {}
30 start, end = 0, chunkLength
31 while start < len(data):
32 iteration += 1
33 for line in data[start:end]:
34 for char in ascii:
35 metaFrequency = charFrequency.get(char, {})
36 # must count even if frequency is 0
37 freq = line.count(char)
38 # value is the mode
39 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
40 charFrequency[char] = metaFrequency
41
42 for char in charFrequency.keys():
43 items = list(charFrequency[char].items())
44 if len(items) == 1 and items[0][0] == 0:
45 continue
46 # get the mode of the frequencies
47 if len(items) > 1:
48 modes[char] = max(items, key=lambda x: x[1])
49 # adjust the mode - subtract the sum of all
50 # other frequencies
51 items.remove(modes[char])
52 modes[char] = (modes[char][0], modes[char][1]
53 - sum(item[1] for item in items))
54 else:
55 modes[char] = items[0]
56
57 # build a list of possible delimiters
58 modeList = modes.items()
59 total = float(min(chunkLength * iteration, len(data)))
60 # (rows of consistent data) / (number of rows) = 100%
61 consistency = 1.0
62 # minimum consistency threshold
63 threshold = 0.9
64 while len(delims) == 0 and consistency >= threshold:
65 for k, v in modeList:
66 if v[0] > 0 and v[1] > 0:
67 if ((v[1]/total) >= consistency and
68 (delimiters is None or k in delimiters)):
69 delims[k] = v
70 consistency -= 0.01
71
72 if len(delims) == 1:
73 delim = list(delims.keys())[0]
74 skipinitialspace = (data[0].count(delim) ==
75 data[0].count("%c " % delim))
76 return (delim, skipinitialspace)
77
78 # analyze another chunkLength lines
79 start = end
80 end += chunkLength
81
82 if not delims:
83 return ('', 0)
84
85 # if there's more than one, fall back to a 'preferred' list
86 if len(delims) > 1:
87 for d in self.preferred:
88 if d in delims.keys():
89 skipinitialspace = (data[0].count(d) ==
90 data[0].count("%c " % d))
91 return (d, skipinitialspace)
92
93 # nothing else indicates a preference, pick the character that
94 # dominates(?)
95 items = [(v,k) for (k,v) in delims.items()]
96 items.sort()
97 delim = items[-1][1]
98
99 skipinitialspace = (data[0].count(delim) ==
100 data[0].count("%c " % delim))
101 return (delim, skipinitialspace)