[英]SQL Regex substr pattern match
好的,所以我向我認識的程序員詢問了以下問題,但沒有人能想出一種方法來做到這一點......如果可以,請提供幫助!
我正在為醫院的醫療程序名稱做模式匹配。在這個例子中,只要兩個概念之間有 ¾ 的單詞相同,就把它們視為匹配。基本上,我想讓“x,z,y”與“x,a,y,z”匹配(請記住,我已經刪除了所有非字母數字字符,所以可以這樣比較)。下面是一個手寫展開的冗長示例,我需要找到一種方法,根據單詞數量使其動態化,而不是為每一種組合都手寫一遍。例如:
'Spinal Fusion' = 'Fusion of the Spine'
'Mammogram-bilateral' = 'bilateral mammogram scan'
'Echocardiogram (ECG)' = 'ECG'
我已經寫出了它的工作方式,但其中一些情況有幾十種組合,而且它需要寫成某種 CASE WHEN 語句。如果有人知道如何讓它動態化,我將永遠感激。
-- CASE branch for a four-word x.y compared with a three-word a.b: it applies
-- when at least one set of three of x.y's four words is found, word for word,
-- among the three words of a.b.
WHEN regexp_count(x.y,'(\w+)+') = 4 AND regexp_count(a.b,'(\w+)+') = 3 -- the word counts are 4 and 3
AND (
    -- All four alternatives sit inside this single group so that the
    -- word-count guard above applies to every alternative, not only the first.

    -- words 1, 2 and 3 of x.y
    (
            regexp_substr(x.y,'\w+\b',1,1) IN (regexp_substr(a.b,'\w+\b',1,1), regexp_substr(a.b,'\w+\b',1,2), regexp_substr(a.b,'\w+\b',1,3))
        AND regexp_substr(x.y,'\w+\b',1,2) IN (regexp_substr(a.b,'\w+\b',1,1), regexp_substr(a.b,'\w+\b',1,2), regexp_substr(a.b,'\w+\b',1,3))
        AND regexp_substr(x.y,'\w+\b',1,3) IN (regexp_substr(a.b,'\w+\b',1,1), regexp_substr(a.b,'\w+\b',1,2), regexp_substr(a.b,'\w+\b',1,3))
    )
    OR
    -- words 2, 3 and 4 of x.y
    (
            regexp_substr(x.y,'\w+\b',1,2) IN (regexp_substr(a.b,'\w+\b',1,1), regexp_substr(a.b,'\w+\b',1,2), regexp_substr(a.b,'\w+\b',1,3))
        AND regexp_substr(x.y,'\w+\b',1,3) IN (regexp_substr(a.b,'\w+\b',1,1), regexp_substr(a.b,'\w+\b',1,2), regexp_substr(a.b,'\w+\b',1,3))
        AND regexp_substr(x.y,'\w+\b',1,4) IN (regexp_substr(a.b,'\w+\b',1,1), regexp_substr(a.b,'\w+\b',1,2), regexp_substr(a.b,'\w+\b',1,3))
    )
    OR
    -- words 1, 3 and 4 of x.y
    (
            regexp_substr(x.y,'\w+\b',1,1) IN (regexp_substr(a.b,'\w+\b',1,1), regexp_substr(a.b,'\w+\b',1,2), regexp_substr(a.b,'\w+\b',1,3))
        AND regexp_substr(x.y,'\w+\b',1,3) IN (regexp_substr(a.b,'\w+\b',1,1), regexp_substr(a.b,'\w+\b',1,2), regexp_substr(a.b,'\w+\b',1,3))
        AND regexp_substr(x.y,'\w+\b',1,4) IN (regexp_substr(a.b,'\w+\b',1,1), regexp_substr(a.b,'\w+\b',1,2), regexp_substr(a.b,'\w+\b',1,3))
    )
    OR
    -- words 1, 2 and 4 of x.y
    (
            regexp_substr(x.y,'\w+\b',1,1) IN (regexp_substr(a.b,'\w+\b',1,1), regexp_substr(a.b,'\w+\b',1,2), regexp_substr(a.b,'\w+\b',1,3))
        AND regexp_substr(x.y,'\w+\b',1,2) IN (regexp_substr(a.b,'\w+\b',1,1), regexp_substr(a.b,'\w+\b',1,2), regexp_substr(a.b,'\w+\b',1,3))
        AND regexp_substr(x.y,'\w+\b',1,4) IN (regexp_substr(a.b,'\w+\b',1,1), regexp_substr(a.b,'\w+\b',1,2), regexp_substr(a.b,'\w+\b',1,3))
    )
)
THEN x.y = a.b
試試 Vertica 的文本索引包。
文檔在這裡:https://www.vertica.com/docs/9.3.x/HTML/Content/Authoring/AdministratorsGuide/Tables/TextSearch/TextSearchConceptual.htm?tocpath=Administrator%27s%20Guide%7CUsing%20Text%20Search%7C_____0
這是一種可用於創建輔助表的方法,您最終可以將其與基表連接以獲得匹配的字符串:
-- Start from a clean slate: drop any previous copy of the demo table
-- (and whatever depends on it), then recreate it.
DROP TABLE IF EXISTS textbase CASCADE;

-- One row per phrase: an integer primary key plus the raw phrase text.
CREATE TABLE textbase (
    id  INT NOT NULL PRIMARY KEY,
    txt VARCHAR(32)
)
UNSEGMENTED ALL NODES;
-- Load the six sample phrases. The target columns are named explicitly so the
-- statement does not depend on the table's column order or column count.
INSERT INTO textbase (id, txt)
SELECT 0,'Spinal Fusion'
UNION ALL SELECT 1,'Fusion of the Spine'
UNION ALL SELECT 2,'Mammogram - bilateral'
UNION ALL SELECT 3,'bilateral mammogram scan'
UNION ALL SELECT 4,'Echocardiogram (ECG)'
UNION ALL SELECT 5,'ECG'
;
COMMIT;
-- The approach relies on Vertica's standard Text Index package.
-- One option is a custom stemmer that drops articles, prepositions and
-- typical suffixes; the alternative used here is to add a pre-stemmed copy
-- of txt as a column whose value comes from its DEFAULT expression.
-- Every REGEXP_REPLACE call below is given position 1, occurrence 1 and
-- the 'i' modifier.
ALTER TABLE textbase ADD prestemmed VARCHAR(32) DEFAULT
    REGEXP_REPLACE(
        REGEXP_REPLACE(
            REGEXP_REPLACE(txt, ' the\b', '', 1, 1, 'i'),  -- innermost: remove the article " the"
            ' of\b', '', 1, 1, 'i'                         -- middle: remove the preposition " of"
        ),
        'e\b|al\b', '', 1, 1, 'i'                          -- outermost: remove an "e" or "al" suffix
    );
-- Build the text index on the pre-stemmed column, keyed by the table's id,
-- using the tokenizer and stemmer from the v_txtindex schema.
CREATE TEXT INDEX textindex
    ON textbase (id, prestemmed)
    TOKENIZER v_txtindex.BasicLogTokenizer(LONG VARCHAR)
    STEMMER v_txtindex.Stemmer(LONG VARCHAR);
-- The text index table joins to the INTEGER primary key of the base table using "doc_id"
-- and has one row per token / keyword.
-- Columns are listed explicitly (in the order of the sample output below)
-- so the result shape does not change if either table gains columns.
SELECT
    textbase.id,
    textbase.txt,
    textbase.prestemmed,
    textindex.token,
    textindex.doc_id
FROM textbase
INNER JOIN textindex
    ON textbase.id = textindex.doc_id
ORDER BY textindex.doc_id;
-- NOTE(review): the listing below shows no row for id 5 ('ECG'), which is
-- inserted into textbase; it looks truncated - confirm against a real run.
-- out id | txt | prestemmed | token | doc_id
-- out ----+--------------------------+------------------------+----------------+--------
-- out 0 | Spinal Fusion | Spin Fusion | spin | 0
-- out 0 | Spinal Fusion | Spin Fusion | fusion | 0
-- out 1 | Fusion of the Spine | Fusion Spin | spin | 1
-- out 1 | Fusion of the Spine | Fusion Spin | fusion | 1
-- out 2 | Mammogram - bilateral | Mammogram - bilater | mammogram | 2
-- out 2 | Mammogram - bilateral | Mammogram - bilater | bilat | 2
-- out 3 | bilateral mammogram scan | bilater mammogram scan | scan | 3
-- out 3 | bilateral mammogram scan | bilater mammogram scan | mammogram | 3
-- out 3 | bilateral mammogram scan | bilater mammogram scan | bilat | 3
-- out 4 | Echocardiogram (ECG) | Echocardiogram (ECG) | echocardiogram | 4
-- out 4 | Echocardiogram (ECG) | Echocardiogram (ECG) | ecg | 4
使用上面的文本索引,您可以把每個文檔的單詞總數與它和另一個文檔共有的單詞數進行比較,從而實現“4 個單詞中匹配 3 個”的關鍵字匹配;查詢結果是一個內聯表,可以再次與基表連接:
WITH
-- Number of index tokens per document.
wcount AS (
    SELECT
        doc_id,
        COUNT(*) AS token_count
    FROM textindex
    GROUP BY doc_id
),
-- Ordered pairs of different documents (a, b), kept when the tokens they
-- share make up at least 75% of document a's tokens. The comparison is
-- inclusive (>=) so that an exact 3-of-4 match qualifies.
matchcount AS (
    SELECT
        a.doc_id AS a_doc_id,
        b.doc_id AS b_doc_id,
        COUNT(*) AS match_count
    FROM textindex AS a
    INNER JOIN textindex AS b
        ON b.token = a.token
    -- joined rather than looked up per group with a correlated subquery
    INNER JOIN wcount AS w
        ON w.doc_id = a.doc_id
    WHERE a.doc_id <> b.doc_id
    GROUP BY
        a.doc_id,
        b.doc_id,
        w.token_count
    HAVING COUNT(*) >= w.token_count * .75
)
SELECT
    QUOTE_LITERAL(a.txt) ||' is probably equal to '||QUOTE_LITERAL(b.txt) AS assumption
FROM matchcount
INNER JOIN textbase AS a ON a.id = a_doc_id
INNER JOIN textbase AS b ON b.id = b_doc_id
-- explicit ordering makes the result deterministic
ORDER BY
    a_doc_id,
    b_doc_id
;
-- out assumption
-- out -------------------------------------------------------------------------
-- out 'Spinal Fusion' is probably equal to 'Fusion of the Spine'
-- out 'Fusion of the Spine' is probably equal to 'Spinal Fusion'
-- out 'Mammogram - bilateral' is probably equal to 'bilateral mammogram scan'
-- out 'ECG' is probably equal to 'Echocardiogram (ECG)'
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.