<?xml version="1.0" encoding="utf-8" standalone="yes"?>
通过以上代码,我们就可以把fingerprint的值计算出来,然后存储到MySQL数据库中了。<br />进行相似度搜索的时候,只需要取出已经存储的值进行比对就可以了。<br />
笔者测试了187586条结构数据,大概需要?2秒左右,基本满足一般需求。</div>
]]>
其大概意思就是先通过Fingerprint进行筛选,这样可以快速的筛选掉一部分数据,对于复杂结构更有效;另外就是根据原子个数或者特殊原子个数进行比较,如果查询结构包含三个“N”原子,那么所要查询出的结构所含有“N”的个数必须大于等于3,这样对于包含一些特殊元素的效果是特别的好;还有就是根据分子的一些性质进行筛选过滤,比如芳香性等;最后再进行匹配,这样一来对于复杂结构以及含特殊元素的查询速度会提高很多。<br /> 最后文章中还给出测试数据,从中可以看出,速度一般提高了三倍左右:
Name
SMILES
Correct
FP
Triage
Before
After
Latest
Propane
CCC
65337
66352
42411
42.59
17.99
14.34
Selenium
[Se]
246
995
225
0.80
0.83
0.52
Benzene
c1ccccc1
79426
79486
50893
72.69
27.56
20.29
Methane
C
118519
118524
118511
61.29
5.47
4.25
Amido
NC=O
25695
26975
14702
18.89
9.84
8.16
Methylbenzene
Cc1ccccc1
54529
56869
20490
54.76
35.58
25.90
Carboxy
OC=O
33009
34369
17809
23.86
12.48
10.24
Chlorine
Cl
19424
23318
19424
11.23
1.38
1.12
Cyclopropane
C1CC1
863
4358
484
8.24
7.78
5.02
Biphenyl
c1ccccc1c2ccccc2
2967
5142
146
21.94
21.65
11.44
Dopamine
NCCc1ccc(O)c(O)c1
829
913
23
1.85
2.09
1.47
Sulfisoxazole
7
8
3
0.50
0.88
0.51
BetaCarotene
2
16
1
0.48
0.68
0.58
Nitrofurantoin
0
0
0
0.42
0.58
0.52
]]>
]]>
]]>
]]>
import Java.util.BitSet;
import org.openscience.cdk.DefaultChemObjectBuilder;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.exception.InvalidSmilesException;
import org.openscience.cdk.fingerprint.ExtendedFingerprinter;
import org.openscience.cdk.smiles.SmilesParser;
/**
 * Minimal demonstration of computing a CDK extended fingerprint from a SMILES string.
 */
public class FingerprinterTest {

    /**
     * Parses the SMILES {@code c2ccc1ccccc1c2} and computes its extended fingerprint.
     * The resulting bit set is neither printed nor stored.
     *
     * @param args command-line arguments (unused)
     * @throws CDKException declared for the fingerprint calculation
     * @throws InvalidSmilesException declared for the SMILES parsing step
     */
    public static void main(String[] args) throws InvalidSmilesException, CDKException {
        ExtendedFingerprinter extendedFingerprinter = new ExtendedFingerprinter();
        SmilesParser smilesParser = new SmilesParser(DefaultChemObjectBuilder.getInstance());
        BitSet fingerprintBits =
                extendedFingerprinter.getFingerprint(smilesParser.parseSmiles("c2ccc1ccccc1c2"));
    }
}
import javax.servlet.http.HttpServletResponse;
import javax.vecmath.Point2d;
import org.apache.log4j.Logger;
import org.openscience.cdk.Molecule;
import org.openscience.cdk.interfaces.IAtom;
import org.openscience.cdk.interfaces.IMolecule;
import org.openscience.cdk.io.MDLReader;
import org.openscience.cdk.layout.StructureDiagramGenerator;
import org.openscience.cdk.renderer.Renderer2DModel;
import org.openscience.cdk.renderer.SimpleRenderer2D;
/**
 * Renders a molecule, supplied as MDL structure text, into a square PNG image
 * that is written to an HTTP servlet response.
 */
public class ImageTypeExporterUtil {
    private static final Logger logger = Logger.getLogger(ImageTypeExporterUtil.class);

    /**
     * Shows a molecule structure as a PNG image.
     *
     * <p>Any coordinates present in the input are discarded; a fresh 2D layout is
     * generated, then scaled and centred to fit a {@code length} x {@code length} canvas.
     *
     * @param stucture molecule structure text, read with {@link MDLReader}
     * @param length   width and height of the image in pixels; must be positive
     * @param response HttpServletResponse object whose output stream receives the PNG
     * @throws IllegalArgumentException if {@code length} is null or not positive
     * @throws Exception if reading the structure, generating the layout, rendering
     *                   or writing the image fails
     */
    public static void showAsImage(String stucture, Integer length, HttpServletResponse response) throws Exception {
        logger.debug("ImageTypeExporterUtil.showAsImage..");
        if (length == null || length <= 0) {
            throw new IllegalArgumentException("length must be a positive number of pixels, got " + length);
        }
        final int size = length;

        Molecule mol = new Molecule();
        MDLReader cdkMDL = new MDLReader(new StringReader(stucture));
        try {
            cdkMDL.read(mol);
        } finally {
            cdkMDL.close();
        }

        // null coordinates
        Iterator<IAtom> itatoms = mol.atoms();
        while (itatoms.hasNext()) {
            IAtom atom = itatoms.next();
            atom.setPoint2d(null);
            atom.setPoint3d(null);
        }

        // generate 2D coordinates
        StructureDiagramGenerator sdg = new StructureDiagramGenerator();
        sdg.setMolecule(mol);
        try {
            sdg.generateCoordinates();
        } catch (Exception ex) {
            // The coordinates were cleared above, so atoms may be left without a position;
            // report the real cause instead of failing later on a null coordinate.
            logger.error("Failed to generate 2D coordinates", ex);
            throw ex;
        }
        IMolecule layedOutMol = sdg.getMolecule();

        // scale molecule: compute the bounding box without a magic sentinel value
        double minX = Double.POSITIVE_INFINITY;
        double minY = Double.POSITIVE_INFINITY;
        double maxX = Double.NEGATIVE_INFINITY;
        double maxY = Double.NEGATIVE_INFINITY;
        itatoms = layedOutMol.atoms();
        while (itatoms.hasNext()) {
            Point2d point2d = itatoms.next().getPoint2d();
            minX = Math.min(minX, point2d.x);
            minY = Math.min(minY, point2d.y);
            maxX = Math.max(maxX, point2d.x);
            maxY = Math.max(maxY, point2d.y);
        }
        double scaleX = size / (maxX - minX + 1);
        double scaleY = size / (maxY - minY + 1);
        double scale = scaleX > scaleY ? scaleY : scaleX;
        double centreX = scale * (maxX + minX) / 2.;
        double centreY = scale * (maxY + minY) / 2.;
        double offsetX = size / 2. - centreX;
        double offsetY = size / 2. - centreY;
        itatoms = layedOutMol.atoms();
        while (itatoms.hasNext()) {
            IAtom atom = itatoms.next();
            Point2d a = atom.getPoint2d();
            Point2d b = new Point2d();
            b.x = a.x * scale + offsetX;
            b.y = a.y * scale + offsetY;
            atom.setPoint2d(b);
        }

        // set rendering properties
        Renderer2DModel r2dm = new Renderer2DModel();
        r2dm.setDrawNumbers(false);
        r2dm.setUseAntiAliasing(true);
        r2dm.setColorAtomsByType(true);
        r2dm.setShowAtomTypeNames(false);
        r2dm.setShowAromaticity(true);
        r2dm.setShowImplicitHydrogens(false);
        r2dm.setShowReactionBoxes(false);
        r2dm.setKekuleStructure(false);
        Dimension dim = new Dimension();
        dim.setSize(size, size);
        r2dm.setBackgroundDimension(dim);
        r2dm.setBackColor(java.awt.Color.WHITE);

        // render the image
        SimpleRenderer2D renderer = new SimpleRenderer2D();
        renderer.setRenderer2DModel(r2dm);
        BufferedImage bufferedImage = new BufferedImage(size, size, BufferedImage.TYPE_INT_RGB);
        Graphics2D graphics = bufferedImage.createGraphics();
        try {
            graphics.setPaint(java.awt.Color.WHITE);
            Rectangle2D.Float rectangle = new Rectangle2D.Float(0, 0, size, size);
            graphics.fill(rectangle);
            renderer.paintMolecule(layedOutMol, graphics);
        } finally {
            // release the native resources held by the graphics context
            graphics.dispose();
        }

        // write the image to response
        response.setContentType("image/png");
        OutputStream out = response.getOutputStream();
        try {
            javax.imageio.ImageIO.write(bufferedImage, "png", out);
        } finally {
            out.close();
        }
    }
}
package com.founder.cdk;
import Java.io.File;
import Java.io.FileNotFoundException;
import Java.io.FileReader;
import Java.util.ArrayList;
import Java.util.List;
import org.openscience.cdk.ChemFile;
import org.openscience.cdk.ChemObject;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.io.MDLV2000Reader;
import org.openscience.cdk.smiles.smarts.SMARTSQueryTool;
import org.openscience.cdk.tools.manipulator.ChemFileManipulator;
/**
 * Demo: reads molecules from an SDF file and lists those matching a SMARTS pattern.
 */
public class SMARTSQueryToolTest {
    static SMARTSQueryTool sqt;

    static {
        try {
            sqt = new SMARTSQueryTool("c2ccc1ccccc1c2");
        } catch (CDKException e) {
            // Fail loudly: leaving sqt null would only surface later as a NullPointerException.
            throw new ExceptionInInitializerError(e);
        }
    }

    /**
     * Reads {@code H:\molecules.sdf}, matches every molecule against the SMARTS pattern and
     * prints the number of matches followed by the "ID" property of each matching molecule.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String filename = "H:\\molecules.sdf";
        try {
            MDLV2000Reader reader = new MDLV2000Reader(new FileReader(new File(filename)));
            try {
                ChemFile chemFile = (ChemFile) reader.read((ChemObject) new ChemFile());
                List<IAtomContainer> containersList = ChemFileManipulator.getAllAtomContainers(chemFile);
                List<IAtomContainer> substructureList = new ArrayList<IAtomContainer>();
                // replace the pattern set in the static initializer with the one to search for
                sqt.setSmarts("c1ccc3c(c1)ccc4c2ccccc2ccc34");
                for (IAtomContainer molecule : containersList) {
                    if (sqt.matches(molecule)) {
                        substructureList.add(molecule);
                    }
                }
                System.out.println(substructureList.size());
                for (IAtomContainer molecule : substructureList) {
                    System.out.println(molecule.getProperty("ID"));
                }
            } finally {
                // always release the underlying file handle
                reader.close();
            }
        } catch (CDKException e) {
            e.printStackTrace();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (java.io.IOException e) {
            // raised when closing the reader fails
            e.printStackTrace();
        }
    }
}
通过测试, matches方法速度很慢, 一般一个结构需要?00ms-1000ms左右.
import Java.io.File;
import Java.io.FileNotFoundException;
import Java.io.FileReader;
import Java.util.List;
import org.openscience.cdk.ChemFile;
import org.openscience.cdk.ChemObject;
import org.openscience.cdk.Molecule;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.io.MDLReader;
import org.openscience.cdk.io.MDLV2000Reader;
import org.openscience.cdk.tools.manipulator.ChemFileManipulator;
/**
 * Demo: reads every molecule from an SDF file and prints its properties.
 */
public class ReadSDFTest {

    /**
     * Reads {@code H:\molecules.sdf} and, for each molecule, prints the full property map
     * followed by the value of the "CD_MOLWEIGHT" property.
     *
     * @param args command-line arguments (unused)
     * @throws CDKException if the file content cannot be read as a chemical file
     * @throws FileNotFoundException if the SDF file does not exist
     */
    public static void main(String[] args) throws CDKException, FileNotFoundException {
        String filename = "H:\\molecules.sdf";
        // InputStream ins = ReadSDFTest.class.getClassLoader().getResourceAsStream(filename);
        // MDLReader reader = new MDLReader(ins);
        //alternatively, you can specify a file directly
        MDLV2000Reader reader = new MDLV2000Reader(new FileReader(new File(filename)));
        try {
            ChemFile chemFile = (ChemFile) reader.read((ChemObject) new ChemFile());
            List<IAtomContainer> containersList = ChemFileManipulator.getAllAtomContainers(chemFile);
            // Properties are available on IAtomContainer itself, so no downcast is needed.
            for (IAtomContainer molecule : containersList) {
                System.out.println(molecule.getProperties());
                System.out.println(molecule.getProperty("CD_MOLWEIGHT"));
                // Fingerprinter fp = new Fingerprinter();
                // BitSet bt = fp.getFingerprint(molecule);
                // System.out.println(bt);
            }
        } finally {
            try {
                reader.close();
            } catch (java.io.IOException e) {
                // best-effort close; the data has already been read (or a primary error is propagating)
                e.printStackTrace();
            }
        }
    }
}
import Java.io.StringReader;
import Java.sql.Connection;
import Java.sql.ResultSet;
import Java.sql.SQLException;
import Java.util.ArrayList;
import Java.util.BitSet;
import Java.util.List;
import org.openscience.cdk.Molecule;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.fingerprint.Fingerprinter;
import org.openscience.cdk.io.MDLReader;
import org.openscience.cdk.similarity.Tanimoto;
/**
 * Demo: loads all structures from a MySQL table and runs a Tanimoto similarity
 * search against one of them, printing timings for each phase.
 */
public class CDKTest {

    /**
     * Reads every row of the {@code structure} table into a molecule, uses the fingerprint of
     * the row with id 1220 as the query, and counts the molecules whose Tanimoto coefficient
     * against it is above 0.9.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // MySQL
        long t1 = System.currentTimeMillis();
        Connection con = null;
        try {
            Class.forName("com.mysql.jdbc.Driver").newInstance();
            con = java.sql.DriverManager
                    .getConnection(
                            "jdbc:mysql://localhost/coocoo?useUnicode=true&characterEncoding=utf-8&zeroDateTimeBehavior=convertToNull",
                            "root", "root");
            String querySQL = "select id, structure from structure ";
            ResultSet results = con.createStatement().executeQuery(querySQL);
            // dump out the results
            List<Molecule> list = new ArrayList<Molecule>();
            Fingerprinter fp = new Fingerprinter();
            BitSet bt = null;
            while (results.next()) {
                long id = results.getLong("id");
                // build a molecule object from the stored structure data
                StringReader mdl = new StringReader(results.getString("structure"));
                MDLReader cdkMDL = new MDLReader(mdl);
                Molecule molecule = new Molecule();
                cdkMDL.read(molecule);
                if (id == 1220) {
                    bt = fp.getFingerprint(molecule);
                }
                list.add(molecule);
            }
            System.out.println("size:=" + list.size());
            List<Molecule> resultList = new ArrayList<Molecule>();
            long t2 = System.currentTimeMillis();
            System.out.println("Thread: collection data in " + (t2 - t1) + " ms.");
            if (bt == null) {
                // Without a query fingerprint the comparison below would fail on a null bit set.
                System.err.println("Query structure with id 1220 was not found; similarity search skipped.");
            } else {
                for (Molecule molecule : list) {
                    try {
                        // compute the similarity against the query fingerprint
                        float coefficient = Tanimoto.calculate(fp.getFingerprint(molecule), bt);
                        if (coefficient > 0.9) {
                            resultList.add(molecule);
                        }
                    } catch (CDKException e) {
                        // keep searching, but do not hide the failure
                        System.err.println("Skipping molecule: " + e.getMessage());
                    }
                }
            }
            long t3 = System.currentTimeMillis();
            System.out.println(resultList.size());
            System.out.println("Thread: Search in " + (t3 - t2) + " ms.");
        } catch (InstantiationException e) {
            e.printStackTrace();
        } catch (IllegalAccessException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        } catch (CDKException e) {
            e.printStackTrace();
        } finally {
            // close the connection on every path, not only on success
            if (con != null) {
                try {
                    con.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
        long t4 = System.currentTimeMillis();
        System.out.println("Thread: all in " + (t4 - t1) + " ms.");
    }
}
Rich Apodaca wrote a great series of posts named Fast Substructure Search Using Open Source Tools, providing details on substructure search with MySQL. However, MySQL's poor support for binary data operations limits the implementation of similar-structure search, which typically depends on calculating the Tanimoto coefficient. We are going to use Java & CDK to add this feature.
As the default output of a CDK fingerprint, java.util.BitSet, which implements the Serializable interface, is a perfect data format for fingerprint storage. Java itself provides several collections such as the ArrayList, LinkedList and Vector classes in package java.util. To provide web access to the search engine, the thread-unsafe ArrayList and LinkedList have to be ruled out. How about Vector? Once all the fingerprint data is prepared, the only collection operation we need for similarity search is iteration. No add, no delete. So a lightweight array is enough.
Most of the molecule information is stored in a MySQL database, so we are going to map each fingerprint to its corresponding row in the data table. Here is the MolDFData class; we use a long variable to store the corresponding primary key in the data table.
/**
 * Serializable holder that pairs a numeric id with a fingerprint bit set.
 */
public class MolDFData implements Serializable {

    /** Identifier of the entry this fingerprint belongs to. */
    private long id;

    /** Fingerprint bits; stored and returned by reference, never copied. */
    private BitSet fingerprint;

    /**
     * Creates a holder for the given id and fingerprint.
     *
     * @param id          identifier of the entry
     * @param fingerprint fingerprint bits of the entry
     */
    public MolDFData(final long id, final BitSet fingerprint) {
        this.fingerprint = fingerprint;
        this.id = id;
    }

    /** @return the identifier of the entry */
    public long getId() {
        return this.id;
    }

    /** @return the fingerprint instance held by this object */
    public BitSet getFingerprint() {
        return this.fingerprint;
    }

    /** @param id the new identifier */
    public void setId(final long id) {
        this.id = id;
    }

    /** @param fingerprint the new fingerprint instance to hold */
    public void setFingerprint(final BitSet fingerprint) {
        this.fingerprint = fingerprint;
    }
}
This is how we store our fingerprints.
// Fingerprint entries held in a plain array; the search only iterates over it.
private MolDFData[] arrayData;
There is no big deal with similarity search: just calculate the Tanimoto coefficient and, if it’s bigger than the minimum similarity you set, add this entry to the result.
/**
 * Finds every stored entry whose Tanimoto coefficient against the query fingerprint
 * is strictly greater than the given threshold.
 *
 * @param bt           query fingerprint
 * @param minSimlarity minimum similarity; only coefficients above this value are kept
 * @return list of {@code SearchResultData}, sorted by its natural ordering
 */
public List searchTanimoto(BitSet bt, float minSimlarity) {
    // The raw List type is kept so the method signature stays unchanged for existing callers.
    List resultList = new LinkedList();
    for (MolDFData aListData : arrayData) {
        try {
            float coefficient = Tanimoto.calculate(aListData.getFingerprint(), bt);
            if (coefficient > minSimlarity) {
                resultList.add(new SearchResultData(aListData.getId(), coefficient));
            }
        } catch (CDKException ignored) {
            // An entry that cannot be compared with the query is left out of the result.
        }
    }
    // Sort once after collecting all hits instead of re-sorting on every iteration.
    Collections.sort(resultList);
    return resultList;
}
Pretty ugly code? Maybe. But it really works, at an acceptable speed.
Tests were done using the code below on a MacBook (Intel Core Duo 1.83 GHz, 2 GB RAM).
// Time a single similarity search: t3 and t4 bracket the searchTanimoto call.
long t3 = System.currentTimeMillis();
// bs is presumably the query fingerprint; 0.8f is the minimum similarity threshold.
List<SearchResultData> listResult = se.searchTanimoto(bs, 0.8f);
long t4 = System.currentTimeMillis();
// Print the elapsed time in milliseconds.
System.out.println("Thread: Search done in " + (t4 - t3) + " ms.");
In my database of 87364 commercial compounds, it takes 335 ms.
import org.openscience.cdk.annotations.TestClass;
import org.openscience.cdk.annotations.TestMethod;
import org.openscience.cdk.exception.CDKException;
import Java.util.BitSet;
/**
 * Computes the Tanimoto coefficient, a quantitative measure of the similarity
 * ("distance") of two chemical structures, either from a pair of fingerprint
 * bit sets or from a pair of real valued feature vectors.
 *
 * <p>Fingerprint bit sets can be obtained from the FingerPrinter class. Given two
 * structures stored in cdk.Molecule objects, the coefficient is calculated like:
 * <pre>
 *   BitSet fingerprint1 = Fingerprinter.getFingerprint(molecule1);
 *   BitSet fingerprint2 = Fingerprinter.getFingerprint(molecule2);
 *   float tanimoto_coefficient = Tanimoto.calculate(fingerprint1, fingerprint2);
 * </pre>
 *
 * <p>The FingerPrinter assumes that hydrogens are explicitly given, if this
 * is desired!
 * <p>Note that the continuous Tanimoto coefficient does not lead to a metric space.
 *
 * @author         steinbeck
 * @cdk.githash
 * @cdk.created    2005-10-19
 * @cdk.keyword    jaccard
 * @cdk.keyword    similarity, tanimoto
 * @cdk.module     fingerprint
 */
@TestClass("org.openscience.cdk.similarity.TanimotoTest")
public class Tanimoto {

    /**
     * Calculates the Tanimoto coefficient of two bit sets: the number of bits set in both,
     * divided by the number of bits set in at least one of them.
     *
     * <p>If neither bit set has any bit set, the division is 0/0 and the result is NaN.
     *
     * @param bitset1 A bitset (such as a fingerprint) for the first molecule
     * @param bitset2 A bitset (such as a fingerprint) for the second molecule
     * @return The Tanimoto coefficient
     * @throws org.openscience.cdk.exception.CDKException if {@link BitSet#size()} differs
     *         between the two bit sets
     */
    @TestMethod("testTanimoto1,testTanimoto2")
    public static float calculate(BitSet bitset1, BitSet bitset2) throws CDKException {
        if (bitset1.size() != bitset2.size()) {
            throw new CDKException("Bisets must have the same bit length");
        }

        // Intersect on a copy so that neither argument is modified.
        BitSet intersection = (BitSet) bitset1.clone();
        intersection.and(bitset2);

        float firstCount = bitset1.cardinality();
        float secondCount = bitset2.cardinality();
        float sharedCount = intersection.cardinality();
        return sharedCount / (firstCount + secondCount - sharedCount);
    }

    /**
     * Calculates the continuous Tanimoto coefficient of two real valued vectors:
     * {@code a.b / (a.a + b.b - a.b)}.
     *
     * @param features1 The first feature vector
     * @param features2 The second feature vector
     * @return The continuous Tanimoto coefficient
     * @throws org.openscience.cdk.exception.CDKException if the features are not of the same length
     */
    @TestMethod("testTanimoto3")
    public static float calculate(double[] features1, double[] features2) throws CDKException {
        final int length = features1.length;
        if (length != features2.length) {
            throw new CDKException("Features vectors must be of the same length");
        }

        double dotProduct = 0.0;
        double squaredNorm1 = 0.0;
        double squaredNorm2 = 0.0;
        for (int index = 0; index < length; index++) {
            final double x = features1[index];
            final double y = features2[index];
            dotProduct += x * y;
            squaredNorm1 += x * x;
            squaredNorm2 += y * y;
        }
        return (float) dotProduct / (float) (squaredNorm1 + squaredNorm2 - dotProduct);
    }
}
通过源码可以看出calculate(BitSet bitset1, BitSet bitset2)方法,是通过比较两个分子的fingerprint的位,来计算相似度.通过BitSet的and操作得到共同的个数,然后在除以总共为true的个数,这样就得到相似度.
JChemPaint was started by Christoph Steinbeck in the late 1990's to be the complementary structure editor to Jmol. It was then co-developed by Egon Willighagen and others. Jmol again is a visualisation and analysis tool for 3D molecular structures, started by Dan Gezelter at Notre Dame University, initiator of the Open Science Project and, like JChemPaint, developed by an international team of opensource programmers.
In at least three aspects JChemPaint is different from other 2D editors: