У меня есть очень похожий pandas DataFrame: проблемы при попытке извлечь даты из столбца pandas?
In:
import pandas as pd
df = pd.DataFrame({'blobs':['6-Feb- 1 4 Startup ZestFinance says it has built a machine-learning system that’s smart enough to find new borrowers and keep bias out of its credit analysis. 17-Feb-2014',
'Credit ratings have long been the key measure of how likely a U.S. 29—Oct-2012 consumer is to repay any loan, from mortgages to 18-0ct-12 credit cards. But the factors that FICO and other companies that create credit scores rely on—things like credit history and credit card balances—often depend on having credit already. ',
'November 22, 2012 In recent years, a crop of startup companies have launched on the premise 6—Feb- ] 4 that borrowers without such histories might still be quite likely to repay, and that their likelihood of doing so could be determined by analyzing large amounts of data, especially data that has traditionally not been part of the credit evaluation. These companies use algorithms and machine learning to find meaningful patterns in the data, alternative signs that a borrower is a good or bad credit risk.',
'March 1“, 2012 Los Angeles-based ZestFinance, founded by former Google CIO Douglas Merrill, claims to have solved this problem with a new credit-scoring platform, called ZAML. 06—Fcb—2012 The company sells the machine-learning software to lenders and also offers consulting 19—Jan— ] 2 services. Zest does not lend money itself. January 2, 1990']})
Каждая строка содержит зашумлённые даты в различных форматах. Моя цель — извлечь эти даты в отдельные столбцы; например, из приведённого выше DataFrame я хотел бы извлечь даты вот так:
(*) Out:
dates1 dates2
0 6-Feb-14, 17-Feb-2014 NaN
1 29—Oct-2012, 18-0ct-12 NaN
2 6—Feb- ]4 November 22, 2012
3 06—Fcb—2012, 19—Jan— ] 2 March 1“, 2012 | January 2, 1990
Пока что я попытался задать следующее регулярное выражение и извлечь даты из столбца с помощью extract():
In:
df['col1'] = df['blobs'].str.extract(r"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\1|(?:(?:29|30)(\/|-|\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)(?:0?2|(?:Feb))\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$")
df
Однако я получаю следующую ошибку, и мне хотелось бы иметь более надёжный метод. Как лучше всего получить (*)?
Out:
KeyError Traceback (most recent call last)
/usr/local/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
2133 try:
-> 2134 return self._engine.get_loc(key)
2135 except KeyError:
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()
KeyError: 'date_format_3'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in set(self, item, value, check)
3667 try:
-> 3668 loc = self.items.get_loc(item)
3669 except KeyError:
/usr/local/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
2135 except KeyError:
-> 2136 return self._engine.get_loc(self._maybe_cast_indexer(key))
2137
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()
pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()
KeyError: 'date_format_3'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-3-ec929aa5341c> in <module>()
----> 1 df['date_format_3'] = df['text'].str.extract(r"^(?:(?:31(\/|-|\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\1|(?:(?:29|30)(\/|-|\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\2))(?:(?:1[6-9]|[2-9]\d)?\d{2})$|^(?:29(\/|-|\.)(?:0?2|(?:Feb))\3(?:(?:(?:1[6-9]|[2-9]\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))$|^(?:0?[1-9]|1\d|2[0-8])(\/|-|\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\4(?:(?:1[6-9]|[2-9]\d)?\d{2})$", expand = True)
2 df
/usr/local/lib/python3.5/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
2417 else:
2418 # set column
-> 2419 self._set_item(key, value)
2420
2421 def _setitem_slice(self, key, value):
/usr/local/lib/python3.5/site-packages/pandas/core/frame.py in _set_item(self, key, value)
2484 self._ensure_valid_index(value)
2485 value = self._sanitize_column(key, value)
-> 2486 NDFrame._set_item(self, key, value)
2487
2488 # check if we are modifying a copy
/usr/local/lib/python3.5/site-packages/pandas/core/generic.py in _set_item(self, key, value)
1498
1499 def _set_item(self, key, value):
-> 1500 self._data.set(key, value)
1501 self._clear_item_cache()
1502
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in set(self, item, value, check)
3669 except KeyError:
3670 # This item wasn't present, just insert at end
-> 3671 self.insert(len(self.items), item, value)
3672 return
3673
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in insert(self, loc, item, value, allow_duplicates)
3770
3771 block = make_block(values=value, ndim=self.ndim,
-> 3772 placement=slice(loc, loc + 1))
3773
3774 for blkno, count in _fast_count_smallints(self._blknos[loc:]):
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in make_block(values, placement, klass, ndim, dtype, fastpath)
2683 placement=placement, dtype=dtype)
2684
-> 2685 return klass(values, ndim=ndim, fastpath=fastpath, placement=placement)
2686
2687 # TODO: flexible with index=None and/or items=None
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in __init__(self, values, ndim, fastpath, placement, **kwargs)
1815
1816 super(ObjectBlock, self).__init__(values, ndim=ndim, fastpath=fastpath,
-> 1817 placement=placement, **kwargs)
1818
1819 @property
/usr/local/lib/python3.5/site-packages/pandas/core/internals.py in __init__(self, values, placement, ndim, fastpath)
107 raise ValueError('Wrong number of items passed %d, placement '
108 'implies %d' % (len(self.values),
--> 109 len(self.mgr_locs)))
110
111 @property
ValueError: Wrong number of items passed 4, placement implies 1
Спасибо за помощь... Этот вариант даёт мне много ложных срабатываний в тексте! – tumbleweed
Я обновил свой вопрос и добавил альтернативную версию. – tumbleweed