
点击图片 即可查看课程详情
本文中,我将展示如何使用图形技术和一些编程来构建和探索自己文章的内容。

- 让 API 工作并通过 Python 访问它。
- 使用示例文本进行提示工程,确保 GPT-4 模型理解您想要从中得到什么。
- 下载您指定的文章,并预处理数据。
- 从 ChatGPT 中提取并收集输出。
- 对 ChatGPT 的输出进行后处理。
- 使用 Cypher 查询语言编写代码,以将数据进一步构造成图谱。
- 和你最好的新朋友一起玩,探索你的文章。
# Entity tags the model is allowed to assign to extracted entities.
entities = [
    "Mathematical entity", "Person", "Location", "Animal", "Activity",
    "Programming language", "Equation", "Date", "Shape", "Property",
    "Mathematical expression", "Profession", "Time period",
    "Mathematical subject", "Mathematical concept", "Discipline",
    "Mathematical theorem", "Physical entity", "Physics subject", "Physics",
]

# Relationship types the model is allowed to emit between two entities.
# Fix: "HAS_PROPERTY" was listed twice, duplicating it in the prompt text.
relationships = [
    "IS", "ARE", "WAS", "EQUIVALENT_TO", "CONTAINS", "PROPOSED",
    "PARTICIPATED_IN", "SOLVED", "RELATED_TO", "CORRESPONDS_TO",
    "HAS_PROPERTY", "REPRESENTS", "IS_USED_IN", "DISCOVERED", "FOUND",
    "IS_SOLUTION_TO", "PROVED", "LIVED_IN", "LIKED", "BORN_IN",
    "CONTRIBUTED_TO", "IMPLIES", "DESCRIBES", "DEVELOPED", "USED_FOR",
]

# System prompt sent before each article batch; instructs GPT-4 to emit
# triples in the form "Entity: Tag, RELATION, Entity: Tag". Built with
# implicit string concatenation so the content stays one contract.
prompt = (
    "You are a mathematician and a scientist helping us extract relevant "
    "information from articles about mathematics."
    "The task is to extract as many relevant relationships between entities "
    "to mathematics, physics, or history and science in general as possible."
    "The entities should include all persons, mathematical entities, locations etc."
    f"Specifically, the only entity tags you may use are:{', '.join(entities)}."
    f"The only relationships you may use are:{', '.join(relationships)}"
    'As an example, if the text is "Euler was located in Sankt Petersburg in '
    'the 17 hundreds", the output should have the following format: '
    "Euler: Person, LIVED_IN, Skt. Petersburg: Location"
    'If we have "In 1859, Riemann proved Theorem A", then as an output you '
    "should return Riemann: Person, PROVED, Theorem A: Mathematical theorem"
    "I am only interested in the relationships in the above format and you "
    "can only use what you find in the text provided. Also, you should not "
    "provide relationships already found and you should choose less than 100 "
    "relationships and the most important ones."
    "You should only take the most important relationships as the aim is to "
    "build a knowledge graph. \n"
    "Rather a few but contextual meaningful than many nonsensical."
    "Moreover, you should only tag entities with one of the allowed tags if "
    "it truly fits that category and I am only interested in general "
    'entities such as "Shape HAS Area" rather than "Shape HAS Area 1".'
    "The input text is the following:"
)
import osimport openaifrom prompt_input import promptopenai.api_key = "<Your API key goes here>"def process_gpt4(text):"""This function prompts the gpt-4 model and returns the output"""response = openai.ChatCompletion.create(model="gpt-4",temperature=0,messages=[{"role": "user", "content": prompt + text},],)result = response['choices'][0]['message']['content']return result
from bs4 import BeautifulSoupdef extract_text_from_html(html_content):"""This function extracts the text from the articles"""soup = BeautifulSoup(html_content, 'html.parser')for script in soup(["script", "style"]):script.extract()article_tag = soup.find('article')if article_tag:return " ".join(article_tag.stripped_strings)
def text_to_batches(s, batch_size=2000):words = s.split()batches = []for i in range(0, len(words), batch_size):batch = ' '.join(words[i:i+batch_size])batches.append(batch)return batches
import osfrom tqdm import tqdmfrom connect import process_gpt4from extract_text import extract_text_from_htmlfrom preprocess import text_to_batchesbase_path = 'raw'processed_articles = os.listdir('data')for file_name in tqdm(os.listdir(base_path)):title = ' '.join(file_name.split('_')[-1].split('-')[:-1])if f'results_{title}.txt' in processed_articles:continueresults = ''with open(os.path.join(base_path, file_name), 'r', encoding='utf-8') as f:content = f.read()extraction = extract_text_from_html(content)batches = text_to_batches(extraction)for batch in batches:gpt_results = process_gpt4(batch)results += gpt_resultswith open(f'data/results_{title}.txt', 'w', encoding='utf-8') as results_file:results_file.write(results)with open(f'cleaned/cleaned_{title}.txt', 'w', encoding='utf-8') as cleaned_file:cleaned_file.write(extraction)
from neo4j import GraphDatabaseclass LoadGraphData:def __init__(self, username, password, uri):self.username = usernameself.password = passwordself.uri = uriself.driver = GraphDatabase.driver(self.uri, auth=(self.username, self.password))def create(self, query):with self.driver.session() as graphDB_Session:return graphDB_Session.run(query)def set_max_nodes(self, number):query = f":config initialNodeDisplay: {number}"with self.driver.session() as graphDB_Session:return graphDB_Session.run(query)def delete_graph(self):delete = "MATCH (n) DETACH DELETE n"with self.driver.session() as graphDB_Session:graphDB_Session.run(delete)@staticmethoddef do_cypher_tx(tx, cypher):result = tx.run(cypher)values = []for record in result:values.append(record.values())return valuesdef work_with_data(self, query):with self.driver.session() as session:values = session.read_transaction(self.do_cypher_tx, query)return values
import os
import re

from tqdm import tqdm

from loader import LoadGraphData
from prompt_input import entities, relationships


def create_relationships(loader, title, e1, l1, e2, l2, R):
    """Merge both entity nodes plus the article node, then link them.

    Fix: the original built these queries with backslash line continuations
    *inside* the string literals, gluing Cypher clauses together with no
    whitespace; explicit single-space joins make the statements robust.

    NOTE(review): labels and relationship types cannot be Cypher parameters,
    so they are interpolated. Callers validate them against the
    entities/relationships whitelists and strip double quotes from names.
    """
    merge_nodes = (
        f'MERGE (:Article {{name: "{title}"}}) '
        f'MERGE (:{l1} {{name: "{e1}"}}) '
        f'MERGE (:{l2} {{name: "{e2}"}})'
    )
    loader.create(merge_nodes)

    merge_edges = (
        f'MATCH (t:Article {{name: "{title}"}}) '
        f'MATCH (a:{l1} {{name: "{e1}"}}) '
        f'MATCH (b:{l2} {{name: "{e2}"}}) '
        f'MERGE (a)-[:{R}]->(b) '
        f'MERGE (a)-[:IN_ARTICLE]->(t) '
        f'MERGE (b)-[:IN_ARTICLE]->(t)'
    )
    loader.create(merge_edges)


def _surname(name):
    """Return the last token of *name* whose first char is its own uppercase
    (e.g. 'Leonhard Euler' -> 'Euler'); fall back to *name* unchanged."""
    for token in name.split()[::-1]:
        if token[0].upper() == token[0]:
            return token
    return name


def make_graph(source, cleaned):
    """Parse GPT triple files under *source* and load them into Neo4j.

    Parameters
    ----------
    source : str
        Directory of ``results_<title>.txt`` files ("A: Tag, REL, B: Tag" lines).
    cleaned : str
        Directory of ``cleaned_<title>.txt`` article texts, used to verify
        that both entities actually occur in the article.
    """
    loader = LoadGraphData("neo4j", "<password>", "bolt://localhost:7687")
    loader.delete_graph()
    # Fix: was a list, making every duplicate check O(n); a set is O(1).
    seen = set()
    for results in tqdm(os.listdir(source)):
        with open(os.path.join(source, results)) as r:
            content = r.read()
        lines = content.split('\n')
        # Skip near-empty result files — too little signal to be useful.
        if len(lines) < 10:
            continue
        cleaned_name = 'cleaned_' + '_'.join(results.split('_')[1:])
        with open(os.path.join(cleaned, cleaned_name)) as c:
            cleaned_content = c.read()
        # Hoisted: lowercase once per article instead of twice per line.
        cleaned_lower = cleaned_content.lower()
        for line in lines:
            # Drop any leading "1." style enumeration the model emitted.
            line = re.sub(r'^\d+\.', '', line).strip()
            parts = line.split(',')
            if len(parts) != 3:
                continue
            A, R, B = parts[0], parts[1].strip(), parts[2]
            if ':' not in A or ':' not in B:
                continue
            e1, l1 = A.split(':')[0].strip(), A.split(':')[1].strip()
            e2, l2 = B.split(':')[0].strip(), B.split(':')[1].strip()
            # Only keep entities that literally appear in the article text.
            if e1.lower() not in cleaned_lower or e2.lower() not in cleaned_lower:
                continue
            # Collapse full person names to a single surname token so the
            # same person merges into one node.
            if l1 == 'Person':
                e1 = _surname(e1)
            if l2 == 'Person':
                e2 = _surname(e2)
            if (R == R.upper() and R in relationships
                    and l1 in entities and l2 in entities
                    and len(e1.split()) < 5 and len(e1) > 1
                    and len(e2.split()) < 5 and len(e2) > 1
                    and e1 != e2):
                if line not in seen:
                    seen.add(line)
                    # Labels may not contain spaces; names may not contain
                    # double quotes (they are interpolated into Cypher).
                    l1 = l1.replace(" ", "_")
                    l2 = l2.replace(" ", "_")
                    e1 = e1.replace('"', '')
                    e2 = e2.replace('"', '')
                    title = results.split('.')[0].replace(' ', '_')
                    title = '_'.join(title.split('_')[1:])
                    create_relationships(loader=loader, title=title,
                                         e1=e1, l1=l1, e2=e2, l2=l2, R=R)
from make_graph import make_graphmake_graph(source='data', cleaned='cleaned')
⚪来源: PikeTalk
⚪以上图文,贵在分享,用于学术教育之用,版权归原作者及原出处所有。
线上线下 • 9-12月
【9月15日-18日 • 太原】基于学科体系的知识图谱助力教学改革与教学模式创新研修班
【9月22日-25日 • 济南】课程思政示范课申报设计暨课程思政教学设计工作坊
【9月22日-25日 • 厦门】锻造金师 打造金课——以全国高校教师教学创新大赛为引领的教学竞赛作品培育工作坊
【9月22日-25日 • 济南】职业教育一流核心课程申报及精品在线课程建设专题培训班
【10月13日-16日 • 青岛】探索未来教学新路径:以ChatGPT为代表的AI技术在教学资源制作中的创新应用实操训练营
【10月13日-16日 • 沈阳】职业院校新一轮双高专业群申报及建设方案的设计与撰写实战研修班
【10月13日-16日 • 济南】班级游戏化管理创新大课堂暨班主任工作室建设培训班
【10月27日-30日 • 成都】高等院校“四新建设”背景下基于OBE理念的人才培养方案及教学大纲修订专题培训班
【10月27日-30日 • 广州】新时期高素质教学队伍建设与教学管理模式改革创新培训班
【11月10日-14日 • 重庆】职业教育一流核心课程申报及精品在线课程与专业教学资源库建设培训班
【11月10日-13日 • 重庆】备战2024——全国高校教师教学创新大赛备赛实战与案例解析专题培训班
【11月10日-13日 • 南京】高校青年教师教学竞赛备战专题培训班
【11月24日-27日 • 西安】数字化赋能教学创新——助力教师信息化素养提升实务培训班
【11月24日-27日 • 成都】2024版工程认证自评报告变化解读及撰写思路、申请书材料准备及撰写技巧专题班
【12月8日-11日 • 南京】备战2024年教学能力比赛——全国职业院校2023年教学能力比赛解析与获奖教师现场决赛经验分享高级研修班
【12月15日-18日 • 长沙】高校教育科研课题申报与研究科研论文撰写与发表及教学成果奖申报技巧培训班
热 人工智能相关分享

