Method: csv.Sniffer._guess_delimiter
Calls: 14, Exceptions: 0, Paths: 5
Path 1: 4 calls (0.29)
"Harry's, Arlington Heights, IL, 2/1/03, Kimi Hayes\nShark City, Glendale Heights, IL, 12/28/02, Prezence\nTommy's Place, Blue Island, IL, 12/28/02, B...
None (4)
(',', True) (2) (';', False) (1) ('\t', False) (1)
1def _guess_delimiter(self, data, delimiters):
2 """
3 The delimiter /should/ occur the same number of times on
4 each row. However, due to malformed data, it may not. We don't want
5 an all or nothing approach, so we allow for small variations in this
6 number.
7 1) build a table of the frequency of each character on every line.
8 2) build a table of frequencies of this frequency (meta-frequency?),
9 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
10 7 times in 2 rows'
11 3) use the mode of the meta-frequency to determine the /expected/
12 frequency for that character
13 4) find out how often the character actually meets that goal
14 5) the character that best meets its goal is the delimiter
15 For performance reasons, the data is evaluated in chunks, so it can
16 try and evaluate the smallest portion of the data possible, evaluating
17 additional chunks as necessary.
18 """
19
20 data = list(filter(None, data.split('\n')))
21
22 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
23
24 # build frequency tables
25 chunkLength = min(10, len(data))
26 iteration = 0
27 charFrequency = {}
28 modes = {}
29 delims = {}
30 start, end = 0, chunkLength
31 while start < len(data):
32 iteration += 1
33 for line in data[start:end]:
34 for char in ascii:
35 metaFrequency = charFrequency.get(char, {})
36 # must count even if frequency is 0
37 freq = line.count(char)
38 # value is the mode
39 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
40 charFrequency[char] = metaFrequency
41
42 for char in charFrequency.keys():
43 items = list(charFrequency[char].items())
44 if len(items) == 1 and items[0][0] == 0:
45 continue
46 # get the mode of the frequencies
47 if len(items) > 1:
48 modes[char] = max(items, key=lambda x: x[1])
49 # adjust the mode - subtract the sum of all
50 # other frequencies
51 items.remove(modes[char])
52 modes[char] = (modes[char][0], modes[char][1]
53 - sum(item[1] for item in items))
54 else:
55 modes[char] = items[0]
56
57 # build a list of possible delimiters
58 modeList = modes.items()
59 total = float(min(chunkLength * iteration, len(data)))
60 # (rows of consistent data) / (number of rows) = 100%
61 consistency = 1.0
62 # minimum consistency threshold
63 threshold = 0.9
64 while len(delims) == 0 and consistency >= threshold:
65 for k, v in modeList:
66 if v[0] > 0 and v[1] > 0:
67 if ((v[1]/total) >= consistency and
68 (delimiters is None or k in delimiters)):
69 delims[k] = v
70 consistency -= 0.01
71
72 if len(delims) == 1:
73 delim = list(delims.keys())[0]
74 skipinitialspace = (data[0].count(delim) ==
75 data[0].count("%c " % delim))
76 return (delim, skipinitialspace)
77
78 # analyze another chunkLength lines
79 start = end
80 end += chunkLength
81
82 if not delims:
83 return ('', 0)
84
85 # if there's more than one, fall back to a 'preferred' list
86 if len(delims) > 1:
87 for d in self.preferred:
88 if d in delims.keys():
89 skipinitialspace = (data[0].count(d) ==
90 data[0].count("%c " % d))
91 return (d, skipinitialspace)
92
93 # nothing else indicates a preference, pick the character that
94 # dominates(?)
95 items = [(v,k) for (k,v) in delims.items()]
96 items.sort()
97 delim = items[-1][1]
98
99 skipinitialspace = (data[0].count(delim) ==
100 data[0].count("%c " % delim))
101 return (delim, skipinitialspace)
Path 2: 4 calls (0.29)
"Harry's+ Arlington Heights+ IL+ 2/1/03+ Kimi Hayes\nShark City+ Glendale Heights+ IL+ 12/28/02+ Prezence\nTommy's Place+ Blue Island+ IL+ 12/28/02+ B...
None (4)
('+', True) (3) ('|', False) (1)
1def _guess_delimiter(self, data, delimiters):
2 """
3 The delimiter /should/ occur the same number of times on
4 each row. However, due to malformed data, it may not. We don't want
5 an all or nothing approach, so we allow for small variations in this
6 number.
7 1) build a table of the frequency of each character on every line.
8 2) build a table of frequencies of this frequency (meta-frequency?),
9 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
10 7 times in 2 rows'
11 3) use the mode of the meta-frequency to determine the /expected/
12 frequency for that character
13 4) find out how often the character actually meets that goal
14 5) the character that best meets its goal is the delimiter
15 For performance reasons, the data is evaluated in chunks, so it can
16 try and evaluate the smallest portion of the data possible, evaluating
17 additional chunks as necessary.
18 """
19
20 data = list(filter(None, data.split('\n')))
21
22 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
23
24 # build frequency tables
25 chunkLength = min(10, len(data))
26 iteration = 0
27 charFrequency = {}
28 modes = {}
29 delims = {}
30 start, end = 0, chunkLength
31 while start < len(data):
32 iteration += 1
33 for line in data[start:end]:
34 for char in ascii:
35 metaFrequency = charFrequency.get(char, {})
36 # must count even if frequency is 0
37 freq = line.count(char)
38 # value is the mode
39 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
40 charFrequency[char] = metaFrequency
41
42 for char in charFrequency.keys():
43 items = list(charFrequency[char].items())
44 if len(items) == 1 and items[0][0] == 0:
45 continue
46 # get the mode of the frequencies
47 if len(items) > 1:
48 modes[char] = max(items, key=lambda x: x[1])
49 # adjust the mode - subtract the sum of all
50 # other frequencies
51 items.remove(modes[char])
52 modes[char] = (modes[char][0], modes[char][1]
53 - sum(item[1] for item in items))
54 else:
55 modes[char] = items[0]
56
57 # build a list of possible delimiters
58 modeList = modes.items()
59 total = float(min(chunkLength * iteration, len(data)))
60 # (rows of consistent data) / (number of rows) = 100%
61 consistency = 1.0
62 # minimum consistency threshold
63 threshold = 0.9
64 while len(delims) == 0 and consistency >= threshold:
65 for k, v in modeList:
66 if v[0] > 0 and v[1] > 0:
67 if ((v[1]/total) >= consistency and
68 (delimiters is None or k in delimiters)):
69 delims[k] = v
70 consistency -= 0.01
71
72 if len(delims) == 1:
73 delim = list(delims.keys())[0]
74 skipinitialspace = (data[0].count(delim) ==
75 data[0].count("%c " % delim))
76 return (delim, skipinitialspace)
77
78 # analyze another chunkLength lines
79 start = end
80 end += chunkLength
81
82 if not delims:
83 return ('', 0)
84
85 # if there's more than one, fall back to a 'preferred' list
86 if len(delims) > 1:
87 for d in self.preferred:
88 if d in delims.keys():
89 skipinitialspace = (data[0].count(d) ==
90 data[0].count("%c " % d))
91 return (d, skipinitialspace)
92
93 # nothing else indicates a preference, pick the character that
94 # dominates(?)
95 items = [(v,k) for (k,v) in delims.items()]
96 items.sort()
97 delim = items[-1][1]
98
99 skipinitialspace = (data[0].count(delim) ==
100 data[0].count("%c " % delim))
101 return (delim, skipinitialspace)
Path 3: 3 calls (0.21)
'05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03\n05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03\n05/05/03?05/05/03?05/05/03?05/05/03?05/...
'?,' (1) '/,' (1) ',;' (1)
('?', False) (1) ('/', False) (1) (';', False) (1)
1def _guess_delimiter(self, data, delimiters):
2 """
3 The delimiter /should/ occur the same number of times on
4 each row. However, due to malformed data, it may not. We don't want
5 an all or nothing approach, so we allow for small variations in this
6 number.
7 1) build a table of the frequency of each character on every line.
8 2) build a table of frequencies of this frequency (meta-frequency?),
9 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
10 7 times in 2 rows'
11 3) use the mode of the meta-frequency to determine the /expected/
12 frequency for that character
13 4) find out how often the character actually meets that goal
14 5) the character that best meets its goal is the delimiter
15 For performance reasons, the data is evaluated in chunks, so it can
16 try and evaluate the smallest portion of the data possible, evaluating
17 additional chunks as necessary.
18 """
19
20 data = list(filter(None, data.split('\n')))
21
22 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
23
24 # build frequency tables
25 chunkLength = min(10, len(data))
26 iteration = 0
27 charFrequency = {}
28 modes = {}
29 delims = {}
30 start, end = 0, chunkLength
31 while start < len(data):
32 iteration += 1
33 for line in data[start:end]:
34 for char in ascii:
35 metaFrequency = charFrequency.get(char, {})
36 # must count even if frequency is 0
37 freq = line.count(char)
38 # value is the mode
39 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
40 charFrequency[char] = metaFrequency
41
42 for char in charFrequency.keys():
43 items = list(charFrequency[char].items())
44 if len(items) == 1 and items[0][0] == 0:
45 continue
46 # get the mode of the frequencies
47 if len(items) > 1:
48 modes[char] = max(items, key=lambda x: x[1])
49 # adjust the mode - subtract the sum of all
50 # other frequencies
51 items.remove(modes[char])
52 modes[char] = (modes[char][0], modes[char][1]
53 - sum(item[1] for item in items))
54 else:
55 modes[char] = items[0]
56
57 # build a list of possible delimiters
58 modeList = modes.items()
59 total = float(min(chunkLength * iteration, len(data)))
60 # (rows of consistent data) / (number of rows) = 100%
61 consistency = 1.0
62 # minimum consistency threshold
63 threshold = 0.9
64 while len(delims) == 0 and consistency >= threshold:
65 for k, v in modeList:
66 if v[0] > 0 and v[1] > 0:
67 if ((v[1]/total) >= consistency and
68 (delimiters is None or k in delimiters)):
69 delims[k] = v
70 consistency -= 0.01
71
72 if len(delims) == 1:
73 delim = list(delims.keys())[0]
74 skipinitialspace = (data[0].count(delim) ==
75 data[0].count("%c " % delim))
76 return (delim, skipinitialspace)
77
78 # analyze another chunkLength lines
79 start = end
80 end += chunkLength
81
82 if not delims:
83 return ('', 0)
84
85 # if there's more than one, fall back to a 'preferred' list
86 if len(delims) > 1:
87 for d in self.preferred:
88 if d in delims.keys():
89 skipinitialspace = (data[0].count(d) ==
90 data[0].count("%c " % d))
91 return (d, skipinitialspace)
92
93 # nothing else indicates a preference, pick the character that
94 # dominates(?)
95 items = [(v,k) for (k,v) in delims.items()]
96 items.sort()
97 delim = items[-1][1]
98
99 skipinitialspace = (data[0].count(delim) ==
100 data[0].count("%c " % delim))
101 return (delim, skipinitialspace)
Path 4: 2 calls (0.14)
'\nabc,def\nghijkl,mno\nghi,jkl\n' (1) '\nabc,def\nghijkl,mnop\nghi,jkl\n' (1)
None (2)
(',', False) (2)
1def _guess_delimiter(self, data, delimiters):
2 """
3 The delimiter /should/ occur the same number of times on
4 each row. However, due to malformed data, it may not. We don't want
5 an all or nothing approach, so we allow for small variations in this
6 number.
7 1) build a table of the frequency of each character on every line.
8 2) build a table of frequencies of this frequency (meta-frequency?),
9 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
10 7 times in 2 rows'
11 3) use the mode of the meta-frequency to determine the /expected/
12 frequency for that character
13 4) find out how often the character actually meets that goal
14 5) the character that best meets its goal is the delimiter
15 For performance reasons, the data is evaluated in chunks, so it can
16 try and evaluate the smallest portion of the data possible, evaluating
17 additional chunks as necessary.
18 """
19
20 data = list(filter(None, data.split('\n')))
21
22 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
23
24 # build frequency tables
25 chunkLength = min(10, len(data))
26 iteration = 0
27 charFrequency = {}
28 modes = {}
29 delims = {}
30 start, end = 0, chunkLength
31 while start < len(data):
32 iteration += 1
33 for line in data[start:end]:
34 for char in ascii:
35 metaFrequency = charFrequency.get(char, {})
36 # must count even if frequency is 0
37 freq = line.count(char)
38 # value is the mode
39 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
40 charFrequency[char] = metaFrequency
41
42 for char in charFrequency.keys():
43 items = list(charFrequency[char].items())
44 if len(items) == 1 and items[0][0] == 0:
45 continue
46 # get the mode of the frequencies
47 if len(items) > 1:
48 modes[char] = max(items, key=lambda x: x[1])
49 # adjust the mode - subtract the sum of all
50 # other frequencies
51 items.remove(modes[char])
52 modes[char] = (modes[char][0], modes[char][1]
53 - sum(item[1] for item in items))
54 else:
55 modes[char] = items[0]
56
57 # build a list of possible delimiters
58 modeList = modes.items()
59 total = float(min(chunkLength * iteration, len(data)))
60 # (rows of consistent data) / (number of rows) = 100%
61 consistency = 1.0
62 # minimum consistency threshold
63 threshold = 0.9
64 while len(delims) == 0 and consistency >= threshold:
65 for k, v in modeList:
66 if v[0] > 0 and v[1] > 0:
67 if ((v[1]/total) >= consistency and
68 (delimiters is None or k in delimiters)):
69 delims[k] = v
70 consistency -= 0.01
71
72 if len(delims) == 1:
73 delim = list(delims.keys())[0]
74 skipinitialspace = (data[0].count(delim) ==
75 data[0].count("%c " % delim))
76 return (delim, skipinitialspace)
77
78 # analyze another chunkLength lines
79 start = end
80 end += chunkLength
81
82 if not delims:
83 return ('', 0)
84
85 # if there's more than one, fall back to a 'preferred' list
86 if len(delims) > 1:
87 for d in self.preferred:
88 if d in delims.keys():
89 skipinitialspace = (data[0].count(d) ==
90 data[0].count("%c " % d))
91 return (d, skipinitialspace)
92
93 # nothing else indicates a preference, pick the character that
94 # dominates(?)
95 items = [(v,k) for (k,v) in delims.items()]
96 items.sort()
97 delim = items[-1][1]
98
99 skipinitialspace = (data[0].count(delim) ==
100 data[0].count("%c " % delim))
101 return (delim, skipinitialspace)
Path 5: 1 calls (0.07)
'05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03\n05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03\n05/05/03?05/05/03?05/05/03?05/05/03?05/...
None (1)
('0', False) (1)
1def _guess_delimiter(self, data, delimiters):
2 """
3 The delimiter /should/ occur the same number of times on
4 each row. However, due to malformed data, it may not. We don't want
5 an all or nothing approach, so we allow for small variations in this
6 number.
7 1) build a table of the frequency of each character on every line.
8 2) build a table of frequencies of this frequency (meta-frequency?),
9 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
10 7 times in 2 rows'
11 3) use the mode of the meta-frequency to determine the /expected/
12 frequency for that character
13 4) find out how often the character actually meets that goal
14 5) the character that best meets its goal is the delimiter
15 For performance reasons, the data is evaluated in chunks, so it can
16 try and evaluate the smallest portion of the data possible, evaluating
17 additional chunks as necessary.
18 """
19
20 data = list(filter(None, data.split('\n')))
21
22 ascii = [chr(c) for c in range(127)] # 7-bit ASCII
23
24 # build frequency tables
25 chunkLength = min(10, len(data))
26 iteration = 0
27 charFrequency = {}
28 modes = {}
29 delims = {}
30 start, end = 0, chunkLength
31 while start < len(data):
32 iteration += 1
33 for line in data[start:end]:
34 for char in ascii:
35 metaFrequency = charFrequency.get(char, {})
36 # must count even if frequency is 0
37 freq = line.count(char)
38 # value is the mode
39 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
40 charFrequency[char] = metaFrequency
41
42 for char in charFrequency.keys():
43 items = list(charFrequency[char].items())
44 if len(items) == 1 and items[0][0] == 0:
45 continue
46 # get the mode of the frequencies
47 if len(items) > 1:
48 modes[char] = max(items, key=lambda x: x[1])
49 # adjust the mode - subtract the sum of all
50 # other frequencies
51 items.remove(modes[char])
52 modes[char] = (modes[char][0], modes[char][1]
53 - sum(item[1] for item in items))
54 else:
55 modes[char] = items[0]
56
57 # build a list of possible delimiters
58 modeList = modes.items()
59 total = float(min(chunkLength * iteration, len(data)))
60 # (rows of consistent data) / (number of rows) = 100%
61 consistency = 1.0
62 # minimum consistency threshold
63 threshold = 0.9
64 while len(delims) == 0 and consistency >= threshold:
65 for k, v in modeList:
66 if v[0] > 0 and v[1] > 0:
67 if ((v[1]/total) >= consistency and
68 (delimiters is None or k in delimiters)):
69 delims[k] = v
70 consistency -= 0.01
71
72 if len(delims) == 1:
73 delim = list(delims.keys())[0]
74 skipinitialspace = (data[0].count(delim) ==
75 data[0].count("%c " % delim))
76 return (delim, skipinitialspace)
77
78 # analyze another chunkLength lines
79 start = end
80 end += chunkLength
81
82 if not delims:
83 return ('', 0)
84
85 # if there's more than one, fall back to a 'preferred' list
86 if len(delims) > 1:
87 for d in self.preferred:
88 if d in delims.keys():
89 skipinitialspace = (data[0].count(d) ==
90 data[0].count("%c " % d))
91 return (d, skipinitialspace)
92
93 # nothing else indicates a preference, pick the character that
94 # dominates(?)
95 items = [(v,k) for (k,v) in delims.items()]
96 items.sort()
97 delim = items[-1][1]
98
99 skipinitialspace = (data[0].count(delim) ==
100 data[0].count("%c " % delim))
101 return (delim, skipinitialspace)